about summary refs log tree commit diff stats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- kernel/sched.c 534
1 files changed, 328 insertions, 206 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index eecf070ffd1a..fd05861b2111 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
309 */ 309 */
310static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
311 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
313static int root_task_group_empty(void) 315static int root_task_group_empty(void)
314{ 316{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
316} 318}
317#endif 319#endif
318 320
319#ifdef CONFIG_FAIR_GROUP_SCHED
320#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
321# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
322#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -534,14 +535,12 @@ struct rq {
534 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
535 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
536#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
537 unsigned long last_tick_seen;
538 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
539#endif 539#endif
540 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 541 struct load_weight load;
542 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
543 u64 nr_switches; 543 u64 nr_switches;
544 u64 nr_migrations_in;
545 544
546 struct cfs_rq cfs; 545 struct cfs_rq cfs;
547 struct rt_rq rt; 546 struct rt_rq rt;
@@ -590,6 +589,8 @@ struct rq {
590 589
591 u64 rt_avg; 590 u64 rt_avg;
592 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
593#endif 594#endif
594 595
595 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq)
676 677
677/** 678/**
678 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
679 * 681 *
680 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
681 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
@@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
770 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
771 return -EINVAL; 773 return -EINVAL;
772 774
773 filp->f_pos += cnt; 775 *ppos += cnt;
774 776
775 return cnt; 777 return cnt;
776} 778}
@@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 * default: 0.25ms 814 * default: 0.25ms
813 */ 815 */
814unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815 818
816/* 819/*
817 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1612,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1612 */ 1615 */
1613static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1614{ 1617{
1615 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1616 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1617 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1618 unsigned long flags; 1621 unsigned long flags;
@@ -1628,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1628 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1629 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1630 1633
1634 rq_weight += weight;
1631 /* 1635 /*
1632 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1633 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1636,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1636 if (!weight) 1640 if (!weight)
1637 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1638 1642
1639 rq_weight += weight; 1643 sum_weight += weight;
1640 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1641 } 1645 }
1642 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1643 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1644 shares = tg->shares; 1651 shares = tg->shares;
1645 1652
@@ -1808,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1808#endif 1815#endif
1809 1816
1810static void calc_load_account_active(struct rq *this_rq); 1817static void calc_load_account_active(struct rq *this_rq);
1818static void update_sysctl(void);
1819static int get_update_sysctl_factor(void);
1820
1821static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822{
1823 set_task_rq(p, cpu);
1824#ifdef CONFIG_SMP
1825 /*
1826 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1827 * successfully executed on another CPU. We must ensure that updates of
1828 * per-task data have been completed by this moment.
1829 */
1830 smp_wmb();
1831 task_thread_info(p)->cpu = cpu;
1832#endif
1833}
1811 1834
1812#include "sched_stats.h" 1835#include "sched_stats.h"
1813#include "sched_idletask.c" 1836#include "sched_idletask.c"
@@ -1965,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
1965 return cpu_curr(task_cpu(p)) == p; 1988 return cpu_curr(task_cpu(p)) == p;
1966} 1989}
1967 1990
1968static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1969{
1970 set_task_rq(p, cpu);
1971#ifdef CONFIG_SMP
1972 /*
1973 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1974 * successfuly executed on another CPU. We must ensure that updates of
1975 * per-task data have been completed by this moment.
1976 */
1977 smp_wmb();
1978 task_thread_info(p)->cpu = cpu;
1979#endif
1980}
1981
1982static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1991static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1983 const struct sched_class *prev_class, 1992 const struct sched_class *prev_class,
1984 int oldprio, int running) 1993 int oldprio, int running)
@@ -1991,6 +2000,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1991 p->sched_class->prio_changed(rq, p, oldprio, running); 2000 p->sched_class->prio_changed(rq, p, oldprio, running);
1992} 2001}
1993 2002
2003/**
2004 * kthread_bind - bind a just-created kthread to a cpu.
2005 * @p: thread created by kthread_create().
2006 * @cpu: cpu (might not be online, must be possible) for @p to run on.
2007 *
2008 * Description: This function is equivalent to set_cpus_allowed(),
2009 * except that @cpu doesn't need to be online, and the thread must be
2010 * stopped (i.e., just returned from kthread_create()).
2011 *
2012 * Function lives here instead of kthread.c because it messes with
2013 * scheduler internals which require locking.
2014 */
2015void kthread_bind(struct task_struct *p, unsigned int cpu)
2016{
2017 struct rq *rq = cpu_rq(cpu);
2018 unsigned long flags;
2019
2020 /* Must have done schedule() in kthread() before we set_task_cpu */
2021 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2022 WARN_ON(1);
2023 return;
2024 }
2025
2026 spin_lock_irqsave(&rq->lock, flags);
2027 update_rq_clock(rq);
2028 set_task_cpu(p, cpu);
2029 p->cpus_allowed = cpumask_of_cpu(cpu);
2030 p->rt.nr_cpus_allowed = 1;
2031 p->flags |= PF_THREAD_BOUND;
2032 spin_unlock_irqrestore(&rq->lock, flags);
2033}
2034EXPORT_SYMBOL(kthread_bind);
2035
1994#ifdef CONFIG_SMP 2036#ifdef CONFIG_SMP
1995/* 2037/*
1996 * Is this task likely cache-hot: 2038 * Is this task likely cache-hot:
@@ -2003,7 +2045,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 /* 2045 /*
2004 * Buddy candidates are cache hot: 2046 * Buddy candidates are cache hot:
2005 */ 2047 */
2006 if (sched_feat(CACHE_HOT_BUDDY) && 2048 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2007 (&p->se == cfs_rq_of(&p->se)->next || 2049 (&p->se == cfs_rq_of(&p->se)->next ||
2008 &p->se == cfs_rq_of(&p->se)->last)) 2050 &p->se == cfs_rq_of(&p->se)->last))
2009 return 1; 2051 return 1;
@@ -2025,30 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2025void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2067void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2026{ 2068{
2027 int old_cpu = task_cpu(p); 2069 int old_cpu = task_cpu(p);
2028 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2029 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2070 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2030 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2071 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2031 u64 clock_offset;
2032
2033 clock_offset = old_rq->clock - new_rq->clock;
2034 2072
2035 trace_sched_migrate_task(p, new_cpu); 2073 trace_sched_migrate_task(p, new_cpu);
2036 2074
2037#ifdef CONFIG_SCHEDSTATS
2038 if (p->se.wait_start)
2039 p->se.wait_start -= clock_offset;
2040 if (p->se.sleep_start)
2041 p->se.sleep_start -= clock_offset;
2042 if (p->se.block_start)
2043 p->se.block_start -= clock_offset;
2044#endif
2045 if (old_cpu != new_cpu) { 2075 if (old_cpu != new_cpu) {
2046 p->se.nr_migrations++; 2076 p->se.nr_migrations++;
2047 new_rq->nr_migrations_in++;
2048#ifdef CONFIG_SCHEDSTATS
2049 if (task_hot(p, old_rq->clock, NULL))
2050 schedstat_inc(p, se.nr_forced2_migrations);
2051#endif
2052 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2077 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2053 1, 1, NULL, 0); 2078 1, 1, NULL, 0);
2054 } 2079 }
@@ -2081,6 +2106,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2081 * it is sufficient to simply update the task's cpu field. 2106 * it is sufficient to simply update the task's cpu field.
2082 */ 2107 */
2083 if (!p->se.on_rq && !task_running(rq, p)) { 2108 if (!p->se.on_rq && !task_running(rq, p)) {
2109 update_rq_clock(rq);
2084 set_task_cpu(p, dest_cpu); 2110 set_task_cpu(p, dest_cpu);
2085 return 0; 2111 return 0;
2086 } 2112 }
@@ -2288,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
2288 preempt_enable(); 2314 preempt_enable();
2289} 2315}
2290 2316
2317#ifdef CONFIG_SMP
2318static inline
2319int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2320{
2321 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2322}
2323#endif
2324
2291/*** 2325/***
2292 * try_to_wake_up - wake up a thread 2326 * try_to_wake_up - wake up a thread
2293 * @p: the to-be-woken-up thread 2327 * @p: the to-be-woken-up thread
@@ -2307,7 +2341,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2307{ 2341{
2308 int cpu, orig_cpu, this_cpu, success = 0; 2342 int cpu, orig_cpu, this_cpu, success = 0;
2309 unsigned long flags; 2343 unsigned long flags;
2310 struct rq *rq; 2344 struct rq *rq, *orig_rq;
2311 2345
2312 if (!sched_feat(SYNC_WAKEUPS)) 2346 if (!sched_feat(SYNC_WAKEUPS))
2313 wake_flags &= ~WF_SYNC; 2347 wake_flags &= ~WF_SYNC;
@@ -2315,7 +2349,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2315 this_cpu = get_cpu(); 2349 this_cpu = get_cpu();
2316 2350
2317 smp_wmb(); 2351 smp_wmb();
2318 rq = task_rq_lock(p, &flags); 2352 rq = orig_rq = task_rq_lock(p, &flags);
2319 update_rq_clock(rq); 2353 update_rq_clock(rq);
2320 if (!(p->state & state)) 2354 if (!(p->state & state))
2321 goto out; 2355 goto out;
@@ -2339,13 +2373,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2339 if (task_contributes_to_load(p)) 2373 if (task_contributes_to_load(p))
2340 rq->nr_uninterruptible--; 2374 rq->nr_uninterruptible--;
2341 p->state = TASK_WAKING; 2375 p->state = TASK_WAKING;
2342 task_rq_unlock(rq, &flags); 2376 __task_rq_unlock(rq);
2343 2377
2344 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2378 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2345 if (cpu != orig_cpu) 2379 if (cpu != orig_cpu)
2346 set_task_cpu(p, cpu); 2380 set_task_cpu(p, cpu);
2347 2381
2348 rq = task_rq_lock(p, &flags); 2382 rq = __task_rq_lock(p);
2383 update_rq_clock(rq);
2384
2349 WARN_ON(p->state != TASK_WAKING); 2385 WARN_ON(p->state != TASK_WAKING);
2350 cpu = task_cpu(p); 2386 cpu = task_cpu(p);
2351 2387
@@ -2402,6 +2438,17 @@ out_running:
2402#ifdef CONFIG_SMP 2438#ifdef CONFIG_SMP
2403 if (p->sched_class->task_wake_up) 2439 if (p->sched_class->task_wake_up)
2404 p->sched_class->task_wake_up(rq, p); 2440 p->sched_class->task_wake_up(rq, p);
2441
2442 if (unlikely(rq->idle_stamp)) {
2443 u64 delta = rq->clock - rq->idle_stamp;
2444 u64 max = 2*sysctl_sched_migration_cost;
2445
2446 if (delta > max)
2447 rq->avg_idle = max;
2448 else
2449 update_avg(&rq->avg_idle, delta);
2450 rq->idle_stamp = 0;
2451 }
2405#endif 2452#endif
2406out: 2453out:
2407 task_rq_unlock(rq, &flags); 2454 task_rq_unlock(rq, &flags);
@@ -2448,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
2448 p->se.avg_overlap = 0; 2495 p->se.avg_overlap = 0;
2449 p->se.start_runtime = 0; 2496 p->se.start_runtime = 0;
2450 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2497 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2451 p->se.avg_running = 0;
2452 2498
2453#ifdef CONFIG_SCHEDSTATS 2499#ifdef CONFIG_SCHEDSTATS
2454 p->se.wait_start = 0; 2500 p->se.wait_start = 0;
@@ -2470,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
2470 p->se.nr_failed_migrations_running = 0; 2516 p->se.nr_failed_migrations_running = 0;
2471 p->se.nr_failed_migrations_hot = 0; 2517 p->se.nr_failed_migrations_hot = 0;
2472 p->se.nr_forced_migrations = 0; 2518 p->se.nr_forced_migrations = 0;
2473 p->se.nr_forced2_migrations = 0;
2474 2519
2475 p->se.nr_wakeups = 0; 2520 p->se.nr_wakeups = 0;
2476 p->se.nr_wakeups_sync = 0; 2521 p->se.nr_wakeups_sync = 0;
@@ -2511,22 +2556,17 @@ void sched_fork(struct task_struct *p, int clone_flags)
2511 __sched_fork(p); 2556 __sched_fork(p);
2512 2557
2513 /* 2558 /*
2514 * Make sure we do not leak PI boosting priority to the child.
2515 */
2516 p->prio = current->normal_prio;
2517
2518 /*
2519 * Revert to default priority/policy on fork if requested. 2559 * Revert to default priority/policy on fork if requested.
2520 */ 2560 */
2521 if (unlikely(p->sched_reset_on_fork)) { 2561 if (unlikely(p->sched_reset_on_fork)) {
2522 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) 2562 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2523 p->policy = SCHED_NORMAL; 2563 p->policy = SCHED_NORMAL;
2524 2564 p->normal_prio = p->static_prio;
2525 if (p->normal_prio < DEFAULT_PRIO) 2565 }
2526 p->prio = DEFAULT_PRIO;
2527 2566
2528 if (PRIO_TO_NICE(p->static_prio) < 0) { 2567 if (PRIO_TO_NICE(p->static_prio) < 0) {
2529 p->static_prio = NICE_TO_PRIO(0); 2568 p->static_prio = NICE_TO_PRIO(0);
2569 p->normal_prio = p->static_prio;
2530 set_load_weight(p); 2570 set_load_weight(p);
2531 } 2571 }
2532 2572
@@ -2537,11 +2577,19 @@ void sched_fork(struct task_struct *p, int clone_flags)
2537 p->sched_reset_on_fork = 0; 2577 p->sched_reset_on_fork = 0;
2538 } 2578 }
2539 2579
2580 /*
2581 * Make sure we do not leak PI boosting priority to the child.
2582 */
2583 p->prio = current->normal_prio;
2584
2540 if (!rt_prio(p->prio)) 2585 if (!rt_prio(p->prio))
2541 p->sched_class = &fair_sched_class; 2586 p->sched_class = &fair_sched_class;
2542 2587
2588 if (p->sched_class->task_fork)
2589 p->sched_class->task_fork(p);
2590
2543#ifdef CONFIG_SMP 2591#ifdef CONFIG_SMP
2544 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2592 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2545#endif 2593#endif
2546 set_task_cpu(p, cpu); 2594 set_task_cpu(p, cpu);
2547 2595
@@ -2576,19 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2576 rq = task_rq_lock(p, &flags); 2624 rq = task_rq_lock(p, &flags);
2577 BUG_ON(p->state != TASK_RUNNING); 2625 BUG_ON(p->state != TASK_RUNNING);
2578 update_rq_clock(rq); 2626 update_rq_clock(rq);
2579 2627 activate_task(rq, p, 0);
2580 p->prio = effective_prio(p);
2581
2582 if (!p->sched_class->task_new || !current->se.on_rq) {
2583 activate_task(rq, p, 0);
2584 } else {
2585 /*
2586 * Let the scheduling class do new task startup
2587 * management (if any):
2588 */
2589 p->sched_class->task_new(rq, p);
2590 inc_nr_running(rq);
2591 }
2592 trace_sched_wakeup_new(rq, p, 1); 2628 trace_sched_wakeup_new(rq, p, 1);
2593 check_preempt_curr(rq, p, WF_FORK); 2629 check_preempt_curr(rq, p, WF_FORK);
2594#ifdef CONFIG_SMP 2630#ifdef CONFIG_SMP
@@ -2812,14 +2848,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2812 */ 2848 */
2813 arch_start_context_switch(prev); 2849 arch_start_context_switch(prev);
2814 2850
2815 if (unlikely(!mm)) { 2851 if (likely(!mm)) {
2816 next->active_mm = oldmm; 2852 next->active_mm = oldmm;
2817 atomic_inc(&oldmm->mm_count); 2853 atomic_inc(&oldmm->mm_count);
2818 enter_lazy_tlb(oldmm, next); 2854 enter_lazy_tlb(oldmm, next);
2819 } else 2855 } else
2820 switch_mm(oldmm, mm, next); 2856 switch_mm(oldmm, mm, next);
2821 2857
2822 if (unlikely(!prev->mm)) { 2858 if (likely(!prev->mm)) {
2823 prev->active_mm = NULL; 2859 prev->active_mm = NULL;
2824 rq->prev_mm = oldmm; 2860 rq->prev_mm = oldmm;
2825 } 2861 }
@@ -2982,15 +3018,6 @@ static void calc_load_account_active(struct rq *this_rq)
2982} 3018}
2983 3019
2984/* 3020/*
2985 * Externally visible per-cpu scheduler statistics:
2986 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2987 */
2988u64 cpu_nr_migrations(int cpu)
2989{
2990 return cpu_rq(cpu)->nr_migrations_in;
2991}
2992
2993/*
2994 * Update rq->cpu_load[] statistics. This function is usually called every 3021 * Update rq->cpu_load[] statistics. This function is usually called every
2995 * scheduler tick (TICK_NSEC). 3022 * scheduler tick (TICK_NSEC).
2996 */ 3023 */
@@ -3112,7 +3139,7 @@ out:
3112void sched_exec(void) 3139void sched_exec(void)
3113{ 3140{
3114 int new_cpu, this_cpu = get_cpu(); 3141 int new_cpu, this_cpu = get_cpu();
3115 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3142 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3116 put_cpu(); 3143 put_cpu();
3117 if (new_cpu != this_cpu) 3144 if (new_cpu != this_cpu)
3118 sched_migrate_task(current, new_cpu); 3145 sched_migrate_task(current, new_cpu);
@@ -3128,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3128 deactivate_task(src_rq, p, 0); 3155 deactivate_task(src_rq, p, 0);
3129 set_task_cpu(p, this_cpu); 3156 set_task_cpu(p, this_cpu);
3130 activate_task(this_rq, p, 0); 3157 activate_task(this_rq, p, 0);
3131 /*
3132 * Note that idle threads have a prio of MAX_PRIO, for this test
3133 * to be always true for them.
3134 */
3135 check_preempt_curr(this_rq, p, 0); 3158 check_preempt_curr(this_rq, p, 0);
3136} 3159}
3137 3160
@@ -3654,6 +3677,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3654 3677
3655/** 3678/**
3656 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3679 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3680 * @sd: The sched_domain whose statistics are to be updated.
3657 * @group: sched_group whose statistics are to be updated. 3681 * @group: sched_group whose statistics are to be updated.
3658 * @this_cpu: Cpu for which load balance is currently performed. 3682 * @this_cpu: Cpu for which load balance is currently performed.
3659 * @idle: Idle status of this_cpu 3683 * @idle: Idle status of this_cpu
@@ -4089,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4089 unsigned long flags; 4113 unsigned long flags;
4090 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4114 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4091 4115
4092 cpumask_setall(cpus); 4116 cpumask_copy(cpus, cpu_active_mask);
4093 4117
4094 /* 4118 /*
4095 * When power savings policy is enabled for the parent domain, idle 4119 * When power savings policy is enabled for the parent domain, idle
@@ -4252,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4252 int all_pinned = 0; 4276 int all_pinned = 0;
4253 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4277 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4254 4278
4255 cpumask_setall(cpus); 4279 cpumask_copy(cpus, cpu_active_mask);
4256 4280
4257 /* 4281 /*
4258 * When power savings policy is enabled for the parent domain, idle 4282 * When power savings policy is enabled for the parent domain, idle
@@ -4392,6 +4416,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4392 int pulled_task = 0; 4416 int pulled_task = 0;
4393 unsigned long next_balance = jiffies + HZ; 4417 unsigned long next_balance = jiffies + HZ;
4394 4418
4419 this_rq->idle_stamp = this_rq->clock;
4420
4421 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4422 return;
4423
4395 for_each_domain(this_cpu, sd) { 4424 for_each_domain(this_cpu, sd) {
4396 unsigned long interval; 4425 unsigned long interval;
4397 4426
@@ -4406,8 +4435,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4406 interval = msecs_to_jiffies(sd->balance_interval); 4435 interval = msecs_to_jiffies(sd->balance_interval);
4407 if (time_after(next_balance, sd->last_balance + interval)) 4436 if (time_after(next_balance, sd->last_balance + interval))
4408 next_balance = sd->last_balance + interval; 4437 next_balance = sd->last_balance + interval;
4409 if (pulled_task) 4438 if (pulled_task) {
4439 this_rq->idle_stamp = 0;
4410 break; 4440 break;
4441 }
4411 } 4442 }
4412 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4443 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4413 /* 4444 /*
@@ -4642,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
4642 cpumask_set_cpu(cpu, nohz.cpu_mask); 4673 cpumask_set_cpu(cpu, nohz.cpu_mask);
4643 4674
4644 /* time for ilb owner also to sleep */ 4675 /* time for ilb owner also to sleep */
4645 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4676 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4646 if (atomic_read(&nohz.load_balancer) == cpu) 4677 if (atomic_read(&nohz.load_balancer) == cpu)
4647 atomic_set(&nohz.load_balancer, -1); 4678 atomic_set(&nohz.load_balancer, -1);
4648 return 0; 4679 return 0;
@@ -5009,8 +5040,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5009 p->gtime = cputime_add(p->gtime, cputime); 5040 p->gtime = cputime_add(p->gtime, cputime);
5010 5041
5011 /* Add guest time to cpustat. */ 5042 /* Add guest time to cpustat. */
5012 cpustat->user = cputime64_add(cpustat->user, tmp); 5043 if (TASK_NICE(p) > 0) {
5013 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5044 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5045 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5046 } else {
5047 cpustat->user = cputime64_add(cpustat->user, tmp);
5048 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5049 }
5014} 5050}
5015 5051
5016/* 5052/*
@@ -5125,60 +5161,86 @@ void account_idle_ticks(unsigned long ticks)
5125 * Use precise platform statistics if available: 5161 * Use precise platform statistics if available:
5126 */ 5162 */
5127#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5163#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5128cputime_t task_utime(struct task_struct *p) 5164void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5129{ 5165{
5130 return p->utime; 5166 *ut = p->utime;
5167 *st = p->stime;
5131} 5168}
5132 5169
5133cputime_t task_stime(struct task_struct *p) 5170void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5134{ 5171{
5135 return p->stime; 5172 struct task_cputime cputime;
5173
5174 thread_group_cputime(p, &cputime);
5175
5176 *ut = cputime.utime;
5177 *st = cputime.stime;
5136} 5178}
5137#else 5179#else
5138cputime_t task_utime(struct task_struct *p) 5180
5181#ifndef nsecs_to_cputime
5182# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5183#endif
5184
5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5139{ 5186{
5140 clock_t utime = cputime_to_clock_t(p->utime), 5187 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5141 total = utime + cputime_to_clock_t(p->stime);
5142 u64 temp;
5143 5188
5144 /* 5189 /*
5145 * Use CFS's precise accounting: 5190 * Use CFS's precise accounting:
5146 */ 5191 */
5147 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5192 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5148 5193
5149 if (total) { 5194 if (total) {
5150 temp *= utime; 5195 u64 temp;
5196
5197 temp = (u64)(rtime * utime);
5151 do_div(temp, total); 5198 do_div(temp, total);
5152 } 5199 utime = (cputime_t)temp;
5153 utime = (clock_t)temp; 5200 } else
5201 utime = rtime;
5202
5203 /*
5204 * Compare with previous values, to keep monotonicity:
5205 */
5206 p->prev_utime = max(p->prev_utime, utime);
5207 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5154 5208
5155 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5209 *ut = p->prev_utime;
5156 return p->prev_utime; 5210 *st = p->prev_stime;
5157} 5211}
5158 5212
5159cputime_t task_stime(struct task_struct *p) 5213/*
5214 * Must be called with siglock held.
5215 */
5216void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5160{ 5217{
5161 clock_t stime; 5218 struct signal_struct *sig = p->signal;
5219 struct task_cputime cputime;
5220 cputime_t rtime, utime, total;
5162 5221
5163 /* 5222 thread_group_cputime(p, &cputime);
5164 * Use CFS's precise accounting. (we subtract utime from
5165 * the total, to make sure the total observed by userspace
5166 * grows monotonically - apps rely on that):
5167 */
5168 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5169 cputime_to_clock_t(task_utime(p));
5170 5223
5171 if (stime >= 0) 5224 total = cputime_add(cputime.utime, cputime.stime);
5172 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5225 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5173 5226
5174 return p->prev_stime; 5227 if (total) {
5175} 5228 u64 temp;
5176#endif
5177 5229
5178inline cputime_t task_gtime(struct task_struct *p) 5230 temp = (u64)(rtime * cputime.utime);
5179{ 5231 do_div(temp, total);
5180 return p->gtime; 5232 utime = (cputime_t)temp;
5233 } else
5234 utime = rtime;
5235
5236 sig->prev_utime = max(sig->prev_utime, utime);
5237 sig->prev_stime = max(sig->prev_stime,
5238 cputime_sub(rtime, sig->prev_utime));
5239
5240 *ut = sig->prev_utime;
5241 *st = sig->prev_stime;
5181} 5242}
5243#endif
5182 5244
5183/* 5245/*
5184 * This function gets called by the timer code, with HZ frequency. 5246 * This function gets called by the timer code, with HZ frequency.
@@ -5313,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
5313#endif 5375#endif
5314} 5376}
5315 5377
5316static void put_prev_task(struct rq *rq, struct task_struct *p) 5378static void put_prev_task(struct rq *rq, struct task_struct *prev)
5317{ 5379{
5318 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5380 if (prev->state == TASK_RUNNING) {
5381 u64 runtime = prev->se.sum_exec_runtime;
5319 5382
5320 update_avg(&p->se.avg_running, runtime); 5383 runtime -= prev->se.prev_sum_exec_runtime;
5384 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5321 5385
5322 if (p->state == TASK_RUNNING) {
5323 /* 5386 /*
5324 * In order to avoid avg_overlap growing stale when we are 5387 * In order to avoid avg_overlap growing stale when we are
5325 * indeed overlapping and hence not getting put to sleep, grow 5388 * indeed overlapping and hence not getting put to sleep, grow
@@ -5329,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5329 * correlates to the amount of cache footprint a task can 5392 * correlates to the amount of cache footprint a task can
5330 * build up. 5393 * build up.
5331 */ 5394 */
5332 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5395 update_avg(&prev->se.avg_overlap, runtime);
5333 update_avg(&p->se.avg_overlap, runtime);
5334 } else {
5335 update_avg(&p->se.avg_running, 0);
5336 } 5396 }
5337 p->sched_class->put_prev_task(rq, p); 5397 prev->sched_class->put_prev_task(rq, prev);
5338} 5398}
5339 5399
5340/* 5400/*
@@ -5444,7 +5504,7 @@ need_resched_nonpreemptible:
5444} 5504}
5445EXPORT_SYMBOL(schedule); 5505EXPORT_SYMBOL(schedule);
5446 5506
5447#ifdef CONFIG_SMP 5507#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5448/* 5508/*
5449 * Look out! "owner" is an entirely speculative pointer 5509 * Look out! "owner" is an entirely speculative pointer
5450 * access and not reliable. 5510 * access and not reliable.
@@ -6138,22 +6198,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6138 BUG_ON(p->se.on_rq); 6198 BUG_ON(p->se.on_rq);
6139 6199
6140 p->policy = policy; 6200 p->policy = policy;
6141 switch (p->policy) {
6142 case SCHED_NORMAL:
6143 case SCHED_BATCH:
6144 case SCHED_IDLE:
6145 p->sched_class = &fair_sched_class;
6146 break;
6147 case SCHED_FIFO:
6148 case SCHED_RR:
6149 p->sched_class = &rt_sched_class;
6150 break;
6151 }
6152
6153 p->rt_priority = prio; 6201 p->rt_priority = prio;
6154 p->normal_prio = normal_prio(p); 6202 p->normal_prio = normal_prio(p);
6155 /* we are holding p->pi_lock already */ 6203 /* we are holding p->pi_lock already */
6156 p->prio = rt_mutex_getprio(p); 6204 p->prio = rt_mutex_getprio(p);
6205 if (rt_prio(p->prio))
6206 p->sched_class = &rt_sched_class;
6207 else
6208 p->sched_class = &fair_sched_class;
6157 set_load_weight(p); 6209 set_load_weight(p);
6158} 6210}
6159 6211
@@ -6556,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6556long sched_getaffinity(pid_t pid, struct cpumask *mask) 6608long sched_getaffinity(pid_t pid, struct cpumask *mask)
6557{ 6609{
6558 struct task_struct *p; 6610 struct task_struct *p;
6611 unsigned long flags;
6612 struct rq *rq;
6559 int retval; 6613 int retval;
6560 6614
6561 get_online_cpus(); 6615 get_online_cpus();
@@ -6570,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6570 if (retval) 6624 if (retval)
6571 goto out_unlock; 6625 goto out_unlock;
6572 6626
6627 rq = task_rq_lock(p, &flags);
6573 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6629 task_rq_unlock(rq, &flags);
6574 6630
6575out_unlock: 6631out_unlock:
6576 read_unlock(&tasklist_lock); 6632 read_unlock(&tasklist_lock);
@@ -6716,9 +6772,6 @@ EXPORT_SYMBOL(yield);
6716/* 6772/*
6717 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6773 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6718 * that process accounting knows that this is a task in IO wait state. 6774 * that process accounting knows that this is a task in IO wait state.
6719 *
6720 * But don't do that if it is a deliberate, throttling IO wait (this task
6721 * has set its backing_dev_info: the queue against which it should throttle)
6722 */ 6775 */
6723void __sched io_schedule(void) 6776void __sched io_schedule(void)
6724{ 6777{
@@ -6811,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6811{ 6864{
6812 struct task_struct *p; 6865 struct task_struct *p;
6813 unsigned int time_slice; 6866 unsigned int time_slice;
6867 unsigned long flags;
6868 struct rq *rq;
6814 int retval; 6869 int retval;
6815 struct timespec t; 6870 struct timespec t;
6816 6871
@@ -6827,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6827 if (retval) 6882 if (retval)
6828 goto out_unlock; 6883 goto out_unlock;
6829 6884
6830 time_slice = p->sched_class->get_rr_interval(p); 6885 rq = task_rq_lock(p, &flags);
6886 time_slice = p->sched_class->get_rr_interval(rq, p);
6887 task_rq_unlock(rq, &flags);
6831 6888
6832 read_unlock(&tasklist_lock); 6889 read_unlock(&tasklist_lock);
6833 jiffies_to_timespec(time_slice, &t); 6890 jiffies_to_timespec(time_slice, &t);
@@ -6901,7 +6958,7 @@ void show_state_filter(unsigned long state_filter)
6901 /* 6958 /*
6902 * Only show locks if all tasks are dumped: 6959 * Only show locks if all tasks are dumped:
6903 */ 6960 */
6904 if (state_filter == -1) 6961 if (!state_filter)
6905 debug_show_all_locks(); 6962 debug_show_all_locks();
6906} 6963}
6907 6964
@@ -6928,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6928 __sched_fork(idle); 6985 __sched_fork(idle);
6929 idle->se.exec_start = sched_clock(); 6986 idle->se.exec_start = sched_clock();
6930 6987
6931 idle->prio = idle->normal_prio = MAX_PRIO;
6932 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6988 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6933 __set_task_cpu(idle, cpu); 6989 __set_task_cpu(idle, cpu);
6934 6990
@@ -6969,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
6969 * 7025 *
6970 * This idea comes from the SD scheduler of Con Kolivas: 7026 * This idea comes from the SD scheduler of Con Kolivas:
6971 */ 7027 */
/*
 * Side-by-side rows: the left column is the old sched_init_granularity(),
 * the right column is the code that replaces it.
 *
 * Old side: scales the four scheduler tunables in place by
 * 1 + ilog2(num_online_cpus()); sysctl_sched_min_granularity and
 * sysctl_sched_latency are additionally capped at 200000000, while
 * sysctl_sched_wakeup_granularity and sysctl_sched_shares_ratelimit
 * are multiplied without a cap.
 *
 * New side:
 *  - get_update_sysctl_factor() clamps the online CPU count to at most 8
 *    and picks the factor from sysctl_sched_tunable_scaling:
 *    NONE -> 1, LINEAR -> cpus, LOG (and any other value) -> 1 + ilog2(cpus).
 *  - update_sysctl() assigns sysctl_<name> = factor * normalized_sysctl_<name>
 *    for sched_min_granularity, sched_latency, sched_wakeup_granularity and
 *    sched_shares_ratelimit, so the result is recomputed from the normalized
 *    value instead of being multiplied cumulatively.
 *  - sched_init_granularity() now only calls update_sysctl().
 *
 * NOTE(review): the new side shows no equivalent of the old 200000000 cap;
 * presumably the clamp to 8 CPUs bounds the factor instead -- confirm.
 */
6972static inline void sched_init_granularity(void) 7028static int get_update_sysctl_factor(void)
6973{ 7029{
6974 unsigned int factor = 1 + ilog2(num_online_cpus()); 7030 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6975 const unsigned long limit = 200000000; 7031 unsigned int factor;
7032
7033 switch (sysctl_sched_tunable_scaling) {
7034 case SCHED_TUNABLESCALING_NONE:
7035 factor = 1;
7036 break;
7037 case SCHED_TUNABLESCALING_LINEAR:
7038 factor = cpus;
7039 break;
7040 case SCHED_TUNABLESCALING_LOG:
7041 default:
7042 factor = 1 + ilog2(cpus);
7043 break;
7044 }
6976 7045
6977 sysctl_sched_min_granularity *= factor; 7046 return factor;
6978 if (sysctl_sched_min_granularity > limit) 7047}
6979 sysctl_sched_min_granularity = limit;
6980 7048
6981 sysctl_sched_latency *= factor; 7049static void update_sysctl(void)
6982 if (sysctl_sched_latency > limit) 7050{
6983 sysctl_sched_latency = limit; 7051 unsigned int factor = get_update_sysctl_factor();
6984 7052
6985 sysctl_sched_wakeup_granularity *= factor; 7053#define SET_SYSCTL(name) \
7054 (sysctl_##name = (factor) * normalized_sysctl_##name)
7055 SET_SYSCTL(sched_min_granularity);
7056 SET_SYSCTL(sched_latency);
7057 SET_SYSCTL(sched_wakeup_granularity);
7058 SET_SYSCTL(sched_shares_ratelimit);
7059#undef SET_SYSCTL
7060}
6986 7061
6987 sysctl_sched_shares_ratelimit *= factor; 7062static inline void sched_init_granularity(void)
7063{
7064 update_sysctl();
6988} 7065}
6989 7066
6990#ifdef CONFIG_SMP 7067#ifdef CONFIG_SMP
@@ -7021,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7021 int ret = 0; 7098 int ret = 0;
7022 7099
7023 rq = task_rq_lock(p, &flags); 7100 rq = task_rq_lock(p, &flags);
7024 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7101 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7025 ret = -EINVAL; 7102 ret = -EINVAL;
7026 goto out; 7103 goto out;
7027 } 7104 }
@@ -7043,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7043 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7120 if (cpumask_test_cpu(task_cpu(p), new_mask))
7044 goto out; 7121 goto out;
7045 7122
7046 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7123 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7047 /* Need help from migration thread: drop lock and wait. */ 7124 /* Need help from migration thread: drop lock and wait. */
7048 struct task_struct *mt = rq->migration_thread; 7125 struct task_struct *mt = rq->migration_thread;
7049 7126
@@ -7197,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7197 7274
7198again: 7275again:
7199 /* Look for allowed, online CPU in same node. */ 7276 /* Look for allowed, active CPU in same node. */
7200 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7277 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7201 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7278 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7202 goto move; 7279 goto move;
7203 7280
7204 /* Any allowed, online CPU? */ 7281 /* Any allowed, active CPU? */
7205 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7282 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7206 if (dest_cpu < nr_cpu_ids) 7283 if (dest_cpu < nr_cpu_ids)
7207 goto move; 7284 goto move;
7208 7285
7209 /* No more Mr. Nice Guy. */ 7286 /* No more Mr. Nice Guy. */
7210 if (dest_cpu >= nr_cpu_ids) { 7287 if (dest_cpu >= nr_cpu_ids) {
7211 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7288 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7212 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7289 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7213 7290
7214 /* 7291 /*
7215 * Don't tell them about moving exiting tasks or 7292 * Don't tell them about moving exiting tasks or
@@ -7238,7 +7315,7 @@ move:
7238 */ 7315 */
7239static void migrate_nr_uninterruptible(struct rq *rq_src) 7316static void migrate_nr_uninterruptible(struct rq *rq_src)
7240{ 7317{
7241 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7318 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7242 unsigned long flags; 7319 unsigned long flags;
7243 7320
7244 local_irq_save(flags); 7321 local_irq_save(flags);
@@ -7372,17 +7449,16 @@ static struct ctl_table sd_ctl_dir[] = {
7372 .procname = "sched_domain", 7449 .procname = "sched_domain",
7373 .mode = 0555, 7450 .mode = 0555,
7374 }, 7451 },
7375 {0, }, 7452 {}
7376}; 7453};
7377 7454
/*
 * Root sysctl table: a single directory entry named "kernel" (mode 0555)
 * whose child table is sd_ctl_dir, followed by an empty entry (presumably
 * the table terminator). The new side drops the .ctl_name = CTL_KERN
 * initializer and spells the trailing empty entry as {} instead of {0, }.
 */
7378static struct ctl_table sd_ctl_root[] = { 7455static struct ctl_table sd_ctl_root[] = {
7379 { 7456 {
7380 .ctl_name = CTL_KERN,
7381 .procname = "kernel", 7457 .procname = "kernel",
7382 .mode = 0555, 7458 .mode = 0555,
7383 .child = sd_ctl_dir, 7459 .child = sd_ctl_dir,
7384 }, 7460 },
7385 {0, }, 7461 {}
7386}; 7462};
7387 7463
7388static struct ctl_table *sd_alloc_ctl_entry(int n) 7464static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7492,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7492static struct ctl_table_header *sd_sysctl_header; 7568static struct ctl_table_header *sd_sysctl_header;
7493static void register_sched_domain_sysctl(void) 7569static void register_sched_domain_sysctl(void)
7494{ 7570{
7495 int i, cpu_num = num_online_cpus(); 7571 int i, cpu_num = num_possible_cpus();
7496 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7572 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7497 char buf[32]; 7573 char buf[32];
7498 7574
@@ -7502,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
7502 if (entry == NULL) 7578 if (entry == NULL)
7503 return; 7579 return;
7504 7580
7505 for_each_online_cpu(i) { 7581 for_each_possible_cpu(i) {
7506 snprintf(buf, 32, "cpu%d", i); 7582 snprintf(buf, 32, "cpu%d", i);
7507 entry->procname = kstrdup(buf, GFP_KERNEL); 7583 entry->procname = kstrdup(buf, GFP_KERNEL);
7508 entry->mode = 0555; 7584 entry->mode = 0555;
@@ -7632,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7632 spin_lock_irq(&rq->lock); 7708 spin_lock_irq(&rq->lock);
7633 update_rq_clock(rq); 7709 update_rq_clock(rq);
7634 deactivate_task(rq, rq->idle, 0); 7710 deactivate_task(rq, rq->idle, 0);
7635 rq->idle->static_prio = MAX_PRIO;
7636 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7711 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7637 rq->idle->sched_class = &idle_sched_class; 7712 rq->idle->sched_class = &idle_sched_class;
7638 migrate_dead_tasks(cpu); 7713 migrate_dead_tasks(cpu);
@@ -7706,6 +7781,16 @@ early_initcall(migration_init);
7706 7781
7707#ifdef CONFIG_SCHED_DEBUG 7782#ifdef CONFIG_SCHED_DEBUG
7708 7783
/*
 * Flag gating sched_domain_debug(): that function returns immediately
 * unless this is non-zero. It starts out zero (static storage) and is
 * only set by the "sched_debug" early parameter handler below.
 */
7784static __read_mostly int sched_domain_debug_enabled;
7785
/*
 * Handler for the "sched_debug" early parameter. The argument string is
 * not inspected: the mere presence of the parameter sets the flag.
 * Always returns 0.
 */
7786static int __init sched_domain_debug_setup(char *str)
7787{
7788 sched_domain_debug_enabled = 1;
7789
7790 return 0;
7791}
7792early_param("sched_debug", sched_domain_debug_setup);
7793
7709static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7794static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7710 struct cpumask *groupmask) 7795 struct cpumask *groupmask)
7711{ 7796{
@@ -7792,6 +7877,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7792 cpumask_var_t groupmask; 7877 cpumask_var_t groupmask;
7793 int level = 0; 7878 int level = 0;
7794 7879
7880 if (!sched_domain_debug_enabled)
7881 return;
7882
7795 if (!sd) { 7883 if (!sd) {
7796 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7884 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7797 return; 7885 return;
@@ -7871,6 +7959,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7871 7959
7872static void free_rootdomain(struct root_domain *rd) 7960static void free_rootdomain(struct root_domain *rd)
7873{ 7961{
7962 synchronize_sched();
7963
7874 cpupri_cleanup(&rd->cpupri); 7964 cpupri_cleanup(&rd->cpupri);
7875 7965
7876 free_cpumask_var(rd->rto_mask); 7966 free_cpumask_var(rd->rto_mask);
@@ -8011,6 +8101,7 @@ static cpumask_var_t cpu_isolated_map;
8011/* Setup the mask of cpus configured for isolated domains */ 8101/* Setup the mask of cpus configured for isolated domains */
8012static int __init isolated_cpu_setup(char *str) 8102static int __init isolated_cpu_setup(char *str)
8013{ 8103{
8104 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8014 cpulist_parse(str, cpu_isolated_map); 8105 cpulist_parse(str, cpu_isolated_map);
8015 return 1; 8106 return 1;
8016} 8107}
@@ -8847,7 +8938,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8847 return __build_sched_domains(cpu_map, NULL); 8938 return __build_sched_domains(cpu_map, NULL);
8848} 8939}
8849 8940
8850static struct cpumask *doms_cur; /* current sched domains */ 8941static cpumask_var_t *doms_cur; /* current sched domains */
8851static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8942static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8852static struct sched_domain_attr *dattr_cur; 8943static struct sched_domain_attr *dattr_cur;
8853 /* attributes of custom domains in 'doms_cur' */ 8944 /* attributes of custom domains in 'doms_cur' */
@@ -8869,6 +8960,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8869 return 0; 8960 return 0;
8870} 8961}
8871 8962
/*
 * alloc_sched_domains - allocate an array of 'ndoms' cpumask_var_t masks.
 * @ndoms: number of masks to allocate.
 *
 * Allocates the array itself with kmalloc(GFP_KERNEL), then each mask with
 * alloc_cpumask_var(GFP_KERNEL). If a mask allocation fails, the masks
 * allocated so far and the array are released through free_sched_domains()
 * and NULL is returned. The masks are not explicitly cleared here.
 *
 * Returns the array on success, NULL on any allocation failure.
 *
 * NOTE(review): free_sched_domains() is defined after this function;
 * presumably it is declared in a header -- confirm.
 * NOTE(review): sizeof(*doms) * ndoms is an unchecked multiplication, and
 * 'i' is a signed int compared against the unsigned 'ndoms'; both look
 * harmless only if callers pass small counts -- confirm.
 */
8963cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8964{
8965 int i;
8966 cpumask_var_t *doms;
8967
8968 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8969 if (!doms)
8970 return NULL;
8971 for (i = 0; i < ndoms; i++) {
8972 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8973 free_sched_domains(doms, i); /* only the first i masks exist */
8974 return NULL;
8975 }
8976 }
8977 return doms;
8978}
8979
/*
 * free_sched_domains - release an array of cpumask_var_t masks.
 * @doms:  array to release.
 * @ndoms: number of leading entries in @doms that hold allocated masks.
 *
 * Frees the first @ndoms masks with free_cpumask_var() and then the array
 * itself with kfree(). alloc_sched_domains() calls this with a partial
 * count to unwind after a failed mask allocation.
 */
8980void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8981{
8982 unsigned int i;
8983 for (i = 0; i < ndoms; i++)
8984 free_cpumask_var(doms[i]);
8985 kfree(doms);
8986}
8987
8872/* 8988/*
8873 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8989 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8874 * For now this just excludes isolated cpus, but could be used to 8990 * For now this just excludes isolated cpus, but could be used to
@@ -8880,12 +8996,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8880 8996
8881 arch_update_cpu_topology(); 8997 arch_update_cpu_topology();
8882 ndoms_cur = 1; 8998 ndoms_cur = 1;
8883 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8999 doms_cur = alloc_sched_domains(ndoms_cur);
8884 if (!doms_cur) 9000 if (!doms_cur)
8885 doms_cur = fallback_doms; 9001 doms_cur = &fallback_doms;
8886 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9002 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8887 dattr_cur = NULL; 9003 dattr_cur = NULL;
8888 err = build_sched_domains(doms_cur); 9004 err = build_sched_domains(doms_cur[0]);
8889 register_sched_domain_sysctl(); 9005 register_sched_domain_sysctl();
8890 9006
8891 return err; 9007 return err;
@@ -8935,19 +9051,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8935 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9051 * doms_new[] to the current sched domain partitioning, doms_cur[].
8936 * It destroys each deleted domain and builds each new domain. 9052 * It destroys each deleted domain and builds each new domain.
8937 * 9053 *
8938 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9054 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8939 * The masks don't intersect (don't overlap.) We should setup one 9055 * The masks don't intersect (don't overlap.) We should setup one
8940 * sched domain for each mask. CPUs not in any of the cpumasks will 9056 * sched domain for each mask. CPUs not in any of the cpumasks will
8941 * not be load balanced. If the same cpumask appears both in the 9057 * not be load balanced. If the same cpumask appears both in the
8942 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9058 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8943 * it as it is. 9059 * it as it is.
8944 * 9060 *
8945 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9061 * The passed in 'doms_new' should be allocated using
8946 * ownership of it and will kfree it when done with it. If the caller 9062 * alloc_sched_domains. This routine takes ownership of it and will
8947 * failed the kmalloc call, then it can pass in doms_new == NULL && 9063 * free_sched_domains it when done with it. If the caller failed the
8948 * ndoms_new == 1, and partition_sched_domains() will fallback to 9064 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8949 * the single partition 'fallback_doms', it also forces the domains 9065 * and partition_sched_domains() will fallback to the single partition
8950 * to be rebuilt. 9066 * 'fallback_doms', it also forces the domains to be rebuilt.
8951 * 9067 *
8952 * If doms_new == NULL it will be replaced with cpu_online_mask. 9068 * If doms_new == NULL it will be replaced with cpu_active_mask.
8953 * ndoms_new == 0 is a special case for destroying existing domains, 9069 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8955,8 +9071,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8955 * 9071 *
8956 * Call with hotplug lock held 9072 * Call with hotplug lock held
8957 */ 9073 */
8958/* FIXME: Change to struct cpumask *doms_new[] */ 9074void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8959void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8960 struct sched_domain_attr *dattr_new) 9075 struct sched_domain_attr *dattr_new)
8961{ 9076{
8962 int i, j, n; 9077 int i, j, n;
@@ -8975,40 +9090,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8975 /* Destroy deleted domains */ 9090 /* Destroy deleted domains */
8976 for (i = 0; i < ndoms_cur; i++) { 9091 for (i = 0; i < ndoms_cur; i++) {
8977 for (j = 0; j < n && !new_topology; j++) { 9092 for (j = 0; j < n && !new_topology; j++) {
8978 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9093 if (cpumask_equal(doms_cur[i], doms_new[j])
8979 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9094 && dattrs_equal(dattr_cur, i, dattr_new, j))
8980 goto match1; 9095 goto match1;
8981 } 9096 }
8982 /* no match - a current sched domain not in new doms_new[] */ 9097 /* no match - a current sched domain not in new doms_new[] */
8983 detach_destroy_domains(doms_cur + i); 9098 detach_destroy_domains(doms_cur[i]);
8984match1: 9099match1:
8985 ; 9100 ;
8986 } 9101 }
8987 9102
8988 if (doms_new == NULL) { 9103 if (doms_new == NULL) {
8989 ndoms_cur = 0; 9104 ndoms_cur = 0;
8990 doms_new = fallback_doms; 9105 doms_new = &fallback_doms;
8991 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9106 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
8992 WARN_ON_ONCE(dattr_new); 9107 WARN_ON_ONCE(dattr_new);
8993 } 9108 }
8994 9109
8995 /* Build new domains */ 9110 /* Build new domains */
8996 for (i = 0; i < ndoms_new; i++) { 9111 for (i = 0; i < ndoms_new; i++) {
8997 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9112 for (j = 0; j < ndoms_cur && !new_topology; j++) {
8998 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9113 if (cpumask_equal(doms_new[i], doms_cur[j])
8999 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9114 && dattrs_equal(dattr_new, i, dattr_cur, j))
9000 goto match2; 9115 goto match2;
9001 } 9116 }
9002 /* no match - add a new doms_new */ 9117 /* no match - add a new doms_new */
9003 __build_sched_domains(doms_new + i, 9118 __build_sched_domains(doms_new[i],
9004 dattr_new ? dattr_new + i : NULL); 9119 dattr_new ? dattr_new + i : NULL);
9005match2: 9120match2:
9006 ; 9121 ;
9007 } 9122 }
9008 9123
9009 /* Remember the new sched domains */ 9124 /* Remember the new sched domains */
9010 if (doms_cur != fallback_doms) 9125 if (doms_cur != &fallback_doms)
9011 kfree(doms_cur); 9126 free_sched_domains(doms_cur, ndoms_cur);
9012 kfree(dattr_cur); /* kfree(NULL) is safe */ 9127 kfree(dattr_cur); /* kfree(NULL) is safe */
9013 doms_cur = doms_new; 9128 doms_cur = doms_new;
9014 dattr_cur = dattr_new; 9129 dattr_cur = dattr_new;
@@ -9119,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9119 switch (action) { 9234 switch (action) {
9120 case CPU_ONLINE: 9235 case CPU_ONLINE:
9121 case CPU_ONLINE_FROZEN: 9236 case CPU_ONLINE_FROZEN:
9122 case CPU_DEAD: 9237 case CPU_DOWN_PREPARE:
9123 case CPU_DEAD_FROZEN: 9238 case CPU_DOWN_PREPARE_FROZEN:
9239 case CPU_DOWN_FAILED:
9240 case CPU_DOWN_FAILED_FROZEN:
9124 partition_sched_domains(1, NULL, NULL); 9241 partition_sched_domains(1, NULL, NULL);
9125 return NOTIFY_OK; 9242 return NOTIFY_OK;
9126 9243
@@ -9167,7 +9284,7 @@ void __init sched_init_smp(void)
9167#endif 9284#endif
9168 get_online_cpus(); 9285 get_online_cpus();
9169 mutex_lock(&sched_domains_mutex); 9286 mutex_lock(&sched_domains_mutex);
9170 arch_init_sched_domains(cpu_online_mask); 9287 arch_init_sched_domains(cpu_active_mask);
9171 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9288 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9172 if (cpumask_empty(non_isolated_cpus)) 9289 if (cpumask_empty(non_isolated_cpus))
9173 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9290 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9330,10 +9447,6 @@ void __init sched_init(void)
9330#ifdef CONFIG_CPUMASK_OFFSTACK 9447#ifdef CONFIG_CPUMASK_OFFSTACK
9331 alloc_size += num_possible_cpus() * cpumask_size(); 9448 alloc_size += num_possible_cpus() * cpumask_size();
9332#endif 9449#endif
9333 /*
9334 * As sched_init() is called before page_alloc is setup,
9335 * we use alloc_bootmem().
9336 */
9337 if (alloc_size) { 9450 if (alloc_size) {
9338 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9451 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9339 9452
@@ -9488,6 +9601,8 @@ void __init sched_init(void)
9488 rq->cpu = i; 9601 rq->cpu = i;
9489 rq->online = 0; 9602 rq->online = 0;
9490 rq->migration_thread = NULL; 9603 rq->migration_thread = NULL;
9604 rq->idle_stamp = 0;
9605 rq->avg_idle = 2*sysctl_sched_migration_cost;
9491 INIT_LIST_HEAD(&rq->migration_queue); 9606 INIT_LIST_HEAD(&rq->migration_queue);
9492 rq_attach_root(rq, &def_root_domain); 9607 rq_attach_root(rq, &def_root_domain);
9493#endif 9608#endif
@@ -9531,13 +9646,15 @@ void __init sched_init(void)
9531 current->sched_class = &fair_sched_class; 9646 current->sched_class = &fair_sched_class;
9532 9647
9533 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9648 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9534 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9649 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9535#ifdef CONFIG_SMP 9650#ifdef CONFIG_SMP
9536#ifdef CONFIG_NO_HZ 9651#ifdef CONFIG_NO_HZ
9537 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9652 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9538 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9653 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9539#endif 9654#endif
9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9655 /* May be allocated at isolcpus cmdline parse time */
9656 if (cpu_isolated_map == NULL)
9657 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9541#endif /* SMP */ 9658#endif /* SMP */
9542 9659
9543 perf_event_init(); 9660 perf_event_init();
@@ -9731,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9731 se = kzalloc_node(sizeof(struct sched_entity), 9848 se = kzalloc_node(sizeof(struct sched_entity),
9732 GFP_KERNEL, cpu_to_node(i)); 9849 GFP_KERNEL, cpu_to_node(i));
9733 if (!se) 9850 if (!se)
9734 goto err; 9851 goto err_free_rq;
9735 9852
9736 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9853 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9737 } 9854 }
9738 9855
9739 return 1; 9856 return 1;
9740 9857
9858 err_free_rq:
9859 kfree(cfs_rq);
9741 err: 9860 err:
9742 return 0; 9861 return 0;
9743} 9862}
@@ -9819,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9819 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9938 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9820 GFP_KERNEL, cpu_to_node(i)); 9939 GFP_KERNEL, cpu_to_node(i));
9821 if (!rt_se) 9940 if (!rt_se)
9822 goto err; 9941 goto err_free_rq;
9823 9942
9824 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9943 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9825 } 9944 }
9826 9945
9827 return 1; 9946 return 1;
9828 9947
9948 err_free_rq:
9949 kfree(rt_rq);
9829 err: 9950 err:
9830 return 0; 9951 return 0;
9831} 9952}
@@ -10867,6 +10988,7 @@ void synchronize_sched_expedited(void)
10867 spin_unlock_irqrestore(&rq->lock, flags); 10988 spin_unlock_irqrestore(&rq->lock, flags);
10868 } 10989 }
10869 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10990 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10991 synchronize_sched_expedited_count++;
10870 mutex_unlock(&rcu_sched_expedited_mutex); 10992 mutex_unlock(&rcu_sched_expedited_mutex);
10871 put_online_cpus(); 10993 put_online_cpus();
10872 if (need_full_sync) 10994 if (need_full_sync)