path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  279
1 file changed, 244 insertions(+), 35 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:  pointer to dest load array
+ * @offset: offset to add
+ * @shift:  shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
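
The calc_load()/calc_global_load() pair added above is the fixed-point exponential moving average behind /proc/loadavg. Below is a minimal user-space sketch of that arithmetic; it assumes the usual constants from include/linux/sched.h (FSHIFT = 11, FIXED_1 = 1 << 11, EXP_1 = 1884, i.e. 1/exp(5s/1min)) and is an illustration only, not kernel code:

#include <stdio.h>

#define FSHIFT  11                      /* bits of fixed-point precision */
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point */
#define EXP_1   1884                    /* 1/exp(5s/1min) in fixed point */

/* Same arithmetic as the kernel's calc_load() above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        unsigned long active = 3 * FIXED_1;     /* pretend 3 runnable tasks */
        int i;

        /* 24 samples is about 2 minutes at LOAD_FREQ = 5s; the 1-minute
         * average converges towards 3.00. */
        for (i = 0; i < 24; i++) {
                avenrun0 = calc_load(avenrun0, EXP_1, active);
                printf("sample %2d: load = %lu.%02lu\n", i,
                       avenrun0 >> FSHIFT,
                       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}

Each sample moves the average about (FIXED_1 - EXP_1)/FIXED_1, roughly 8%, of the way towards the instantaneous task count, which is why the reported load lags sudden changes.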
@@ -2899,6 +2958,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
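
The hook added to update_cpu_load() above runs calc_load_account_active() at most once per LOAD_FREQ interval per CPU, and that function only folds the difference since the CPU's last contribution into the shared counter. A toy user-space model of this delta accounting, with plain longs standing in for the rq fields and the atomic counter (no concurrency, purely illustrative):

#include <stdio.h>

#define NR_CPUS 4

static long calc_load_tasks;            /* atomic_long_t in the kernel */
static long calc_load_active[NR_CPUS];  /* per-rq field in the kernel */

/* Publish only the change since this CPU's previous contribution. */
static void account_active(int cpu, long nr_active)
{
        long delta = nr_active - calc_load_active[cpu];

        if (delta) {
                calc_load_active[cpu] = nr_active;
                calc_load_tasks += delta;  /* atomic_long_add() in the kernel */
        }
}

int main(void)
{
        account_active(0, 3);   /* cpu0: 3 runnable/uninterruptible tasks */
        account_active(1, 1);
        account_active(0, 2);   /* one task left cpu0: delta = -1 */
        printf("calc_load_tasks = %ld\n", calc_load_tasks);     /* prints 3 */
        return 0;
}

Keeping a per-CPU remembered value means the global sum stays consistent without ever re-scanning all runqueues, which is what the removed nr_active() loop used to do.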
@@ -4240,10 +4304,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:  The cpu whose lowest level of sched domain is to
+ *        be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ *        for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:  The cpu whose domains we're iterating over.
+ * @sd:   variable holding the value of the power_savings_sd
+ *        for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ *          Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
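
is_semi_idle_group() above boils down to one mask test: a sched_group is semi-idle when the set of tick-stopped (nohz) CPUs intersects the group but does not cover it, i.e. the group has at least one idle and at least one busy CPU. A small user-space sketch with plain bitmasks standing in for cpumasks (illustrative only, not the kernel cpumask API):

#include <stdio.h>

/* group_mask: CPUs in the sched_group; nohz_mask: CPUs with the tick stopped. */
static int is_semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
        unsigned long idle_in_group = group_mask & nohz_mask;

        if (idle_in_group == 0)                 /* no idle CPU in the group */
                return 0;
        if (idle_in_group == group_mask)        /* the whole group is idle */
                return 0;
        return 1;                               /* mix of idle and busy CPUs */
}

int main(void)
{
        /* CPUs 0-3 form one package; CPUs 1 and 3 have stopped their tick. */
        unsigned long group = 0xf, nohz = 0xa;

        printf("semi-idle: %d\n", is_semi_idle(group, nohz));   /* prints 1 */
        return 0;
}

find_new_ilb() then walks the SD_POWERSAVINGS_BALANCE domains from lowest to highest and nominates the first idle CPU it finds in such a semi-idle group, so idle load balancing stays on a package that is already partly awake.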
@@ -4298,8 +4478,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4664,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -5007,13 +5195,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5070,15 +5260,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5405,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -6490,8 +6674,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6970,6 +7155,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7397,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8071,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		struct sched_domain *sd;
 
 		sd = &per_cpu(phys_domains, j).sd;
-		if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+		if (j != group_first_cpu(sd->groups)) {
 			/*
 			 * Only add "power" once for each
 			 * physical package.
@@ -7953,7 +8149,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8938,6 +9134,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9243,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9055,6 +9256,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10002,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;