path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 359 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 315 insertions(+), 44 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..14c447ae5d53 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -72,13 +72,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -630,6 +626,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1728,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1958,7 +1960,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
@@ -2015,6 +2017,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw = p->nvcsw;
+	nivcsw = p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
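
The loop above relies on nvcsw/nivcsw being bumped before the switch itself completes, so an advance of two past the waiter's snapshot proves that at least one full context switch started and finished after the snapshot was taken. A minimal userspace sketch of that counter rule (hypothetical code, not the kernel API; the atomic counter and sched_yield() merely stand in for p->nvcsw and cpu_relax()):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_ulong nvcsw;	/* bumped at the *start* of every "switch" */

static void *task(void *arg)
{
	(void)arg;
	for (;;) {
		atomic_fetch_add(&nvcsw, 1);	/* a switch begins... */
		usleep(1000);			/* ...is in progress... */
		/* ...and completes here, before the next one can begin */
	}
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned long snap;

	pthread_create(&t, NULL, task, NULL);
	snap = atomic_load(&nvcsw);

	/* Mirror of the kernel loop: +1 may be a switch still in flight,
	 * +2 means the switch begun after the snapshot has also completed,
	 * because switches of one task are serialized. */
	while (atomic_load(&nvcsw) - snap <= 1)
		sched_yield();			/* stand-in for cpu_relax() */

	printf("at least one full switch completed after the snapshot\n");
	return 0;				/* process exit stops the worker */
}
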
@@ -2458,6 +2503,17 @@ out:
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
@@ -2766,7 +2822,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
@@ -2856,19 +2912,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads: pointer to dest load array
+ * @offset: offset to add
+ * @shift: shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +3008,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
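
calc_load() above is the classic fixed-point exponential decay: every LOAD_FREQ interval the old average is scaled by a precomputed decay factor, and the number of active tasks (collected per runqueue by calc_load_account_active() into calc_load_tasks) supplies the remainder. A standalone sketch of that arithmetic, using the values these macros have long carried in include/linux/sched.h (FSHIFT 11, FIXED_1 2048, EXP_1/EXP_5/EXP_15 = 1884/2014/2037); treat the constants as assumptions of the sketch rather than something this patch introduces:

#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1 << FSHIFT)	/* 1.0 in 11-bit fixed point */
#define EXP_1		1884		/* FIXED_1 / exp(5s/1min)  */
#define EXP_5		2014		/* FIXED_1 / exp(5s/5min)  */
#define EXP_15		2037		/* FIXED_1 / exp(5s/15min) */

/* Same recurrence as the kernel's calc_load(): new = old*e + active*(1-e) */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	/* pretend three tasks are runnable/uninterruptible at every sample */
	unsigned long active = 3 * FIXED_1;

	/* 60 samples at LOAD_FREQ (5 s) = five minutes of simulated uptime */
	for (int i = 0; i < 60; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* print like /proc/loadavg: integer part plus two decimals */
	for (int i = 0; i < 3; i++) {
		unsigned long v = avenrun[i] + FIXED_1 / 200;	/* round */
		printf("%lu.%02lu ", v >> FSHIFT,
		       ((v & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	printf("\n");
	return 0;
}

After five simulated minutes at a constant load of three, the output is roughly "2.9 1.9 0.8": the one-minute average converges quickly while the fifteen-minute one lags, which is exactly the behaviour /proc/loadavg exposes through get_avenrun().
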
@@ -4240,10 +4354,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
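
Behind the cpumask calls, is_semi_idle_group() is just two set tests: the group's overlap with the nohz (tickless idle) mask must be non-empty, yet must not cover the whole group. A toy model with plain 64-bit masks (illustrative helper and variable names, not the kernel cpumask API) shows why find_new_ilb() ends up nominating an idle CPU from a partly busy package and lets a fully idle package stay asleep:

#include <stdint.h>
#include <stdio.h>

/* A group is semi-idle if it holds at least one idle and one busy CPU. */
static int is_semi_idle_group(uint64_t group_cpus, uint64_t nohz_cpus)
{
	uint64_t idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* fully busy group */
		return 0;
	if (idle_in_group == group_cpus)	/* fully idle group */
		return 0;
	return 1;
}

int main(void)
{
	/* two 4-CPU packages: CPUs 0-3 and CPUs 4-7 */
	uint64_t pkg0 = 0x0fULL, pkg1 = 0xf0ULL;
	/* CPU 1 is idle in package 0; package 1 is entirely idle */
	uint64_t nohz = (1ULL << 1) | pkg1;

	printf("pkg0 semi-idle: %d\n", is_semi_idle_group(pkg0, nohz)); /* 1 */
	printf("pkg1 semi-idle: %d\n", is_semi_idle_group(pkg1, nohz)); /* 0 */
	/* so the nominated idle load balancer would be CPU 1, and the
	 * completely idle package 1 is left undisturbed */
	return 0;
}
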
@@ -4298,8 +4528,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4714,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -5007,13 +5245,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5070,15 +5310,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5455,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -5241,6 +5475,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5279,6 +5516,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5315,6 +5555,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5575,9 @@ EXPORT_SYMBOL(complete);
  * @x: holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
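
The sentence added to each of these kernel-doc comments records an ordering guarantee: whatever the waker stored before calling __wake_up()/complete() is visible to the task by the time it observes itself woken, provided a task really was woken. A userspace analogue built on C11 atomics (it illustrates only the ordering, not the wait-queue API; the release store here plays the role of the implied write memory barrier):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;		/* data the waker prepares */
static atomic_int woken;	/* stands in for the task-state change */

static void *sleeper(void *arg)
{
	(void)arg;
	/* acquire pairs with the waker's release: payload is visible */
	while (!atomic_load_explicit(&woken, memory_order_acquire))
		sched_yield();
	printf("woke up, payload = %d\n", payload);	/* always 42 */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sleeper, NULL);

	payload = 42;		/* publish the data first ... */
	atomic_store_explicit(&woken, 1, memory_order_release);	/* ... then "wake" */

	pthread_join(t, NULL);
	return 0;
}
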
@@ -6490,8 +6736,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6970,6 +7217,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7459,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7500,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +8010,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8133,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		struct sched_domain *sd;
 
 		sd = &per_cpu(phys_domains, j).sd;
-		if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+		if (j != group_first_cpu(sd->groups)) {
 			/*
 			 * Only add "power" once for each
 			 * physical package.
@@ -7953,7 +8211,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8938,6 +9196,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9305,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9055,6 +9318,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10064,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;