author     Dmitry Torokhov <dmitry.torokhov@gmail.com>   2009-09-14 00:16:56 -0400
committer  Dmitry Torokhov <dmitry.torokhov@gmail.com>   2009-09-14 00:16:56 -0400
commit     fc8e1ead9314cf0e0f1922e661428b93d3a50d88 (patch)
tree       f3cb97c4769b74f6627a59769f1ed5c92a13c58a /kernel/sched.c
parent     2bcaa6a4238094c5695d5b1943078388d82d3004 (diff)
parent     9de48cc300fb10f7d9faa978670becf5e352462a (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 530
1 file changed, 436 insertions(+), 94 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..1b59e265273b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -68,17 +69,18 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF ((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
 		delta = ktime_to_ns(ktime_sub(hard, soft));
 		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-				HRTIMER_MODE_ABS, 0);
+				HRTIMER_MODE_ABS_PINNED, 0);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -497,6 +493,7 @@ struct rt_rq {
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
+	unsigned long rt_nr_total;
 	int overloaded;
 	struct plist_head pushable_tasks;
 #endif
@@ -584,6 +581,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -630,6 +628,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p) cpu_rq(task_cpu(p))
 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -1154,7 +1156,7 @@ static __init void init_hrtick(void)
 static void hrtick_start(struct rq *rq, u64 delay)
 {
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-			HRTIMER_MODE_REL, 0);
+			HRTIMER_MODE_REL_PINNED, 0);
 }
 
 static inline void init_hrtick(void)
@@ -1728,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1958,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
@@ -1967,12 +1971,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+		perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+				     1, 1, NULL, 0);
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 			 new_cfsrq->min_vruntime;
 
@@ -2015,6 +2024,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw = p->nvcsw;
+	nivcsw = p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2142,6 +2194,7 @@ void kick_process(struct task_struct *p)
 		smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2377,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2458,6 +2532,17 @@ out:
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,21 +2565,44 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
+	p->se.nr_migrations = 0;
 	p->se.last_wakeup = 0;
 	p->se.avg_overlap = 0;
 	p->se.start_runtime = 0;
 	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
-	p->se.sum_sleep_runtime = 0;
-	p->se.sleep_start = 0;
-	p->se.block_start = 0;
-	p->se.sleep_max = 0;
-	p->se.block_max = 0;
-	p->se.exec_max = 0;
-	p->se.slice_max = 0;
-	p->se.wait_max = 0;
+	p->se.wait_max = 0;
+	p->se.wait_count = 0;
+	p->se.wait_sum = 0;
+
+	p->se.sleep_start = 0;
+	p->se.sleep_max = 0;
+	p->se.sum_sleep_runtime = 0;
+
+	p->se.block_start = 0;
+	p->se.block_max = 0;
+	p->se.exec_max = 0;
+	p->se.slice_max = 0;
+
+	p->se.nr_migrations_cold = 0;
+	p->se.nr_failed_migrations_affine = 0;
+	p->se.nr_failed_migrations_running = 0;
+	p->se.nr_failed_migrations_hot = 0;
+	p->se.nr_forced_migrations = 0;
+	p->se.nr_forced2_migrations = 0;
+
+	p->se.nr_wakeups = 0;
+	p->se.nr_wakeups_sync = 0;
+	p->se.nr_wakeups_migrate = 0;
+	p->se.nr_wakeups_local = 0;
+	p->se.nr_wakeups_remote = 0;
+	p->se.nr_wakeups_affine = 0;
+	p->se.nr_wakeups_affine_attempts = 0;
+	p->se.nr_wakeups_passive = 0;
+	p->se.nr_wakeups_idle = 0;
+
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
@@ -2710,6 +2818,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (post_schedule)
@@ -2766,7 +2875,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
@@ -2856,19 +2965,81 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
 
-	return running + uninterruptible;
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
 }
 
 /*
@@ -2899,6 +3070,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -4240,10 +4416,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+int get_nohz_load_balancer(void)
+{
+	return atomic_read(&nohz.load_balancer);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu: The cpu whose lowest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ * for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu: The cpu whose domains we're iterating over.
+ * @sd: variable holding the value of the power_savings_sd
+ * for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+			sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ * Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4595,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4781,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -4840,6 +5145,8 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
+	perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
@@ -5007,13 +5314,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5053,6 +5362,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -5070,15 +5380,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5525,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -5241,6 +5545,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5279,6 +5586,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5315,6 +5625,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5645,9 @@ EXPORT_SYMBOL(complete);
  * @x: holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6248,6 +6564,11 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
+static inline int should_resched(void)
+{
+	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
 static void __cond_resched(void)
 {
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
@@ -6267,8 +6588,7 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
-	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
-			system_state == SYSTEM_RUNNING) {
+	if (should_resched()) {
 		__cond_resched();
 		return 1;
 	}
@@ -6286,12 +6606,12 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int cond_resched_lock(spinlock_t *lock)
 {
-	int resched = need_resched() && system_state == SYSTEM_RUNNING;
+	int resched = should_resched();
 	int ret = 0;
 
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
-		if (resched && need_resched())
+		if (resched)
 			__cond_resched();
 		else
 			cpu_relax();
@@ -6306,7 +6626,7 @@ int __sched cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	if (need_resched() && system_state == SYSTEM_RUNNING) {
+	if (should_resched()) {
 		local_bh_enable();
 		__cond_resched();
 		local_bh_disable();
@@ -6490,8 +6810,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6752,7 +7073,7 @@ static int migration_thread(void *data)
 
 		if (cpu_is_offline(cpu)) {
 			spin_unlock_irq(&rq->lock);
-			goto wait_to_die;
+			break;
 		}
 
 		if (rq->active_balance) {
@@ -6778,16 +7099,7 @@ static int migration_thread(void *data)
 		complete(&req->done);
 	}
 	__set_current_state(TASK_RUNNING);
-	return 0;
 
-wait_to_die:
-	/* Wait for kthread_stop */
-	set_current_state(TASK_INTERRUPTIBLE);
-	while (!kthread_should_stop()) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-	__set_current_state(TASK_RUNNING);
 	return 0;
 }
 
@@ -6970,6 +7282,15 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+	rq->calc_load_active = 0;
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,7 +7514,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		rq = task_rq_lock(p, &flags);
 		__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 		task_rq_unlock(rq, &flags);
+		get_task_struct(p);
 		cpu_rq(cpu)->migration_thread = p;
+		rq->calc_load_update = calc_load_update;
 		break;
 
 	case CPU_ONLINE:
@@ -7221,6 +7544,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		kthread_bind(cpu_rq(cpu)->migration_thread,
 			     cpumask_any(cpu_online_mask));
 		kthread_stop(cpu_rq(cpu)->migration_thread);
+		put_task_struct(cpu_rq(cpu)->migration_thread);
 		cpu_rq(cpu)->migration_thread = NULL;
 		break;
 
@@ -7230,6 +7554,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		migrate_live_tasks(cpu);
 		rq = cpu_rq(cpu);
 		kthread_stop(rq->migration_thread);
+		put_task_struct(rq->migration_thread);
 		rq->migration_thread = NULL;
 		/* Idle task back to normal (off runqueue, low prio) */
 		spin_lock_irq(&rq->lock);
@@ -7243,7 +7568,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
7279 return NOTIFY_OK; 7604 return NOTIFY_OK;
7280} 7605}
7281 7606
7282/* Register at highest priority so that task migration (migrate_all_tasks) 7607/*
7283 * happens before everything else. 7608 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though.
7284 */ 7611 */
7285static struct notifier_block __cpuinitdata migration_notifier = { 7612static struct notifier_block __cpuinitdata migration_notifier = {
7286 .notifier_call = migration_call, 7613 .notifier_call = migration_call,
@@ -7523,26 +7850,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		free_rootdomain(old_rd);
 }
 
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -7753,8 +8077,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8200,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		struct sched_domain *sd;
 
 		sd = &per_cpu(phys_domains, j).sd;
-		if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+		if (j != group_first_cpu(sd->groups)) {
 			/*
 			 * Only add "power" once for each
 			 * physical package.
@@ -7953,7 +8278,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8731,6 +9056,8 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
+const_debug unsigned int sysctl_timer_migration = 1;
+
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
@@ -8770,7 +9097,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
-	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
+	plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
@@ -8865,7 +9192,7 @@ void __init sched_init(void)
 	 * we use alloc_bootmem().
 	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9265,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9287,7 @@ void __init sched_init(void)
 	 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 	 * then A0's share of the cpu resource is:
 	 *
-	 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+	 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 	 *
 	 * We achieve this by letting init_task_group's tasks sit
 	 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9374,26 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 
@@ -9800,6 +10135,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;