author     Ingo Molnar <mingo@elte.hu>   2009-06-11 11:55:42 -0400
committer  Ingo Molnar <mingo@elte.hu>   2009-06-11 11:55:42 -0400
commit     940010c5a314a7bd9b498593bc6ba1718ac5aec5 (patch)
tree       d141e08ced08c40c6a8e3ab2cdecde5ff14e560f /kernel/sched.c
parent     8dc8e5e8bc0ce00b0f656bf972f67cd8a72759e5 (diff)
parent     991ec02cdca33b03a132a0cacfe6f0aa0be9aa8d (diff)
Merge branch 'linus' into perfcounters/core
Conflicts:
        arch/x86/kernel/irqinit.c
        arch/x86/kernel/irqinit_64.c
        arch/x86/kernel/traps.c
        arch/x86/mm/fault.c
        include/linux/sched.h
        kernel/exit.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    359
1 file changed, 315 insertions, 44 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 8d43347a0c0d..5b3f6ec1b0b3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -73,13 +73,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -119,12 +121,6 @@
  */
 #define RUNTIME_INF ((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -632,6 +628,10 @@ struct rq {
         struct list_head migration_queue;
 #endif
 
+        /* calc_load related fields */
+        unsigned long calc_load_update;
+        long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@ -1730,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1960,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
         clock_offset = old_rq->clock - new_rq->clock;
 
-        trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+        trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
@@ -2021,6 +2023,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ *                            context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+        unsigned long nvcsw, nivcsw, flags;
+        int running;
+        struct rq *rq;
+
+        nvcsw = p->nvcsw;
+        nivcsw = p->nivcsw;
+        for (;;) {
+                /*
+                 * The runqueue is assigned before the actual context
+                 * switch. We need to take the runqueue lock.
+                 *
+                 * We could check initially without the lock but it is
+                 * very likely that we need to take the lock in every
+                 * iteration.
+                 */
+                rq = task_rq_lock(p, &flags);
+                running = task_running(rq, p);
+                task_rq_unlock(rq, &flags);
+
+                if (likely(!running))
+                        break;
+                /*
+                 * The switch count is incremented before the actual
+                 * context switch. We thus wait for two switches to be
+                 * sure at least one completed.
+                 */
+                if ((p->nvcsw - nvcsw) > 1)
+                        break;
+                if ((p->nivcsw - nivcsw) > 1)
+                        break;
+
+                cpu_relax();
+        }
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2485,6 +2530,17 @@ out:
         return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
         return try_to_wake_up(p, TASK_ALL, 0);
@@ -2795,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
-        arch_enter_lazy_cpu_mode();
+        arch_start_context_switch(prev);
 
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@ -2885,19 +2941,72 @@ unsigned long nr_iowait(void)
         return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:  pointer to dest load array
+ * @offset: offset to add
+ * @shift:  shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+        loads[0] = (avenrun[0] + offset) << shift;
+        loads[1] = (avenrun[1] + offset) << shift;
+        loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-        unsigned long i, running = 0, uninterruptible = 0;
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        return load >> FSHIFT;
+}
 
-        for_each_online_cpu(i) {
-                running += cpu_rq(i)->nr_running;
-                uninterruptible += cpu_rq(i)->nr_uninterruptible;
-        }
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+        unsigned long upd = calc_load_update + 10;
+        long active;
 
-        if (unlikely((long)uninterruptible < 0))
-                uninterruptible = 0;
+        if (time_before(jiffies, upd))
+                return;
 
-        return running + uninterruptible;
+        active = atomic_long_read(&calc_load_tasks);
+        active = active > 0 ? active * FIXED_1 : 0;
+
+        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+        calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+        long nr_active, delta;
+
+        nr_active = this_rq->nr_running;
+        nr_active += (long) this_rq->nr_uninterruptible;
+
+        if (nr_active != this_rq->calc_load_active) {
+                delta = nr_active - this_rq->calc_load_active;
+                this_rq->calc_load_active = nr_active;
+                atomic_long_add(delta, &calc_load_tasks);
+        }
 }
 
 /*
@@ -2937,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
                         new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+
+        if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+                this_rq->calc_load_update += LOAD_FREQ;
+                calc_load_account_active(this_rq);
+        }
 }
 
 #ifdef CONFIG_SMP
@@ -4278,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+        cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:  The cpu whose lowest level of sched domain is to
+ *        be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ *        for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd)
+                if (sd && (sd->flags & flag))
+                        break;
+
+        return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:  The cpu whose domains we're iterating over.
+ * @sd:   variable holding the value of the power_savings_sd
+ *        for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for (sd = lowest_flag_domain(cpu, flag); \
+                (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                        sched_group_cpus(ilb_group));
+
+        /*
+         * A sched_group is semi-idle when it has atleast one busy cpu
+         * and atleast one idle cpu.
+         */
+        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+                return 0;
+
+        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+                return 0;
+
+        return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ *          Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *ilb_group;
+
+        /*
+         * Have idle load balancer selection from semi-idle packages only
+         * when power-aware load balancing is enabled
+         */
+        if (!(sched_smt_power_savings || sched_mc_power_savings))
+                goto out_done;
+
+        /*
+         * Optimize for the case when we have no idle CPUs or only one
+         * idle CPU. Don't walk the sched_domain hierarchy in such cases
+         */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                goto out_done;
+
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                ilb_group = sd->groups;
+
+                do {
+                        if (is_semi_idle_group(ilb_group))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                        ilb_group = ilb_group->next;
+
+                } while (ilb_group != sd->groups);
+        }
+
+out_done:
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+        return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4336,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick)
                         /* make me the ilb owner */
                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                 return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu)
+                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                        int new_ilb;
+
+                        if (!(sched_smt_power_savings ||
+                                                sched_mc_power_savings))
+                                return 1;
+                        /*
+                         * Check to see if there is a more power-efficient
+                         * ilb.
+                         */
+                        new_ilb = find_new_ilb(cpu);
+                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                                atomic_set(&nohz.load_balancer, -1);
+                                resched_cpu(new_ilb);
+                                return 0;
+                        }
                         return 1;
+                }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@ -4506,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
 
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                        /*
-                         * simple selection for now: Nominate the
-                         * first cpu in the nohz list to be the next
-                         * ilb owner.
-                         *
-                         * TBD: Traverse the sched domains and nominate
-                         * the nearest cpu in the nohz.cpu_mask.
-                         */
-                        int ilb = cpumask_first(nohz.cpu_mask);
+                        int ilb = find_new_ilb(cpu);
 
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -5047,13 +5285,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
 
+need_resched:
+        preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -5111,15 +5351,9 @@ need_resched_nonpreemptible:
 
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-        preempt_disable();
-        __schedule();
         preempt_enable_no_resched();
-        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+        if (need_resched())
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5262,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
 {
         wait_queue_t *curr, *next;
@@ -5282,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5320,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5356,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5373,6 +5616,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6531,8 +6777,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
 #endif
-        printk(KERN_CONT "%5lu %5d %6d\n", free,
-                task_pid_nr(p), task_pid_nr(p->real_parent));
+        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+                task_pid_nr(p), task_pid_nr(p->real_parent),
+                (unsigned long)task_thread_info(p)->flags);
 
         show_stack(p, NULL);
 }
@@ -7011,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
         }
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7245,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
+                rq->calc_load_update = calc_load_update;
+                rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7284,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
-
+                calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7796,8 +8053,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
         struct sched_group sg;
@@ -7918,7 +8176,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                         struct sched_domain *sd;
 
                         sd = &per_cpu(phys_domains, j).sd;
-                        if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                        if (j != group_first_cpu(sd->groups)) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@ -7996,7 +8254,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
         WARN_ON(!sd || !sd->groups);
 
-        if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+        if (cpu != group_first_cpu(sd->groups))
                 return;
 
         child = sd->child;
@@ -8981,6 +9239,8 @@ void __init sched_init(void)
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+                rq->calc_load_active = 0;
+                rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9088,6 +9348,9 @@ void __init sched_init(void)
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+
+        calc_load_update = jiffies + LOAD_FREQ;
+
         /*
          * During early bootup we pretend to be a normal task:
          */
@@ -9098,6 +9361,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
         alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+        alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9845,6 +10109,13 @@ static int sched_rt_global_constraints(void)
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
 
+        /*
+         * There's always some RT tasks in the root group
+         * -- migration, kstopmachine etc..
+         */
+        if (sysctl_sched_rt_runtime == 0)
+                return -EBUSY;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
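
For readers who want to see what the calc_load()/calc_global_load() arithmetic brought in above actually computes, the following standalone sketch (not part of the patch) reproduces the fixed-point exponential averaging in plain C. The FSHIFT/FIXED_1/EXP_* constants are assumed to mirror the include/linux/sched.h values of that era; the task count and iteration count are made-up illustration values, and the final loop prints the three averages the way /proc/loadavg formats them.

/* Standalone sketch of the kernel's fixed-point load-average update
 * (calc_load() / calc_global_load() in the hunks above).  Constants are
 * assumed to match include/linux/sched.h circa 2.6.30; the sample inputs
 * below are hypothetical. */
#include <stdio.h>

#define FSHIFT  11                      /* bits of fixed-point precision */
#define FIXED_1 (1 << FSHIFT)           /* 1.0 in fixed-point */
#define EXP_1   1884                    /* 1/exp(5sec/1min) in fixed-point */
#define EXP_5   2014                    /* 1/exp(5sec/5min) */
#define EXP_15  2037                    /* 1/exp(5sec/15min) */

static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        /* new = old * exp + active * (1 - exp), all in fixed-point */
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 3 * FIXED_1;     /* pretend 3 tasks are runnable */
        int i;

        /* 24 updates at LOAD_FREQ (about 5 seconds each) covers ~2 minutes. */
        for (i = 0; i < 24; i++) {
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
        }

        /* Print as /proc/loadavg does: integer part, then two decimals. */
        for (i = 0; i < 3; i++)
                printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
                       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
        printf("\n");
        return 0;
}

The 1-minute average converges toward the "3 runnable tasks" input fastest, the 15-minute average slowest, which is exactly the behaviour the per-rq calc_load_account_active() / global calc_global_load() split in this merge preserves.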