author     Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
commit     99e97b860e14c64760855198e91d1166697131a7 (patch)
tree       fadc8368c3f784bff92fba82d983e7861559cf9d /kernel
parent     82782ca77d1bfb32b0334cce40a25b91bd8ec016 (diff)
parent     f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: fix typo in sched-rt-group.txt file
  ftrace: fix typo about map of kernel priority in ftrace.txt file.
  sched: properly define the sched_group::cpumask and sched_domain::span fields
  sched, timers: cleanup avenrun users
  sched, timers: move calc_load() to scheduler
  sched: Don't export sched_mc_power_savings on multi-socket single core system
  sched: emit thread info flags with stack trace
  sched: rt: document the risk of small values in the bandwidth settings
  sched: Replace first_cpu() with cpumask_first() in ILB nomination code
  sched: remove extra call overhead for schedule()
  sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus())
  wait: don't use __wake_up_common()
  sched: Nominate a power-efficient ilb in select_nohz_balancer()
  sched: Nominate idle load balancer from a semi-idle package.
  sched: remove redundant hierarchy walk in check_preempt_wakeup
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/mutex.c              4
-rw-r--r--   kernel/sched.c            279
-rw-r--r--   kernel/sched_fair.c        13
-rw-r--r--   kernel/sched_idletask.c     3
-rw-r--r--   kernel/time/timekeeping.c   2
-rw-r--r--   kernel/timer.c             86
-rw-r--r--   kernel/wait.c               2
7 files changed, 262 insertions, 127 deletions
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..6ca5fe96e393 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		__schedule();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2958,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
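A note on the fixed-point math in the hunks above: calc_load() is the classic exponentially-decaying average behind the 1, 5 and 15 minute load figures, now fed from calc_load_tasks instead of a per-tick walk of all runqueues. A small standalone sketch of a series of updates follows; the constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884, a five-second LOAD_FREQ) are the usual values from include/linux/sched.h and are assumed here rather than shown by this diff:

#include <stdio.h>

#define FSHIFT   11                     /* bits of fixed-point fraction (assumed) */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* ~ FIXED_1 * exp(-5s/1min) (assumed) */

/* Same update step as the kernel's calc_load() above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;
	unsigned long active = 3 * FIXED_1;	/* three runnable tasks */
	int i;

	/* one minute's worth of 5-second samples at a constant load of 3 */
	for (i = 0; i < 12; i++)
		avenrun0 = calc_load(avenrun0, EXP_1, active);

	/* format the fixed-point value the way /proc/loadavg does */
	printf("load1 ~ %lu.%02lu\n", avenrun0 >> FSHIFT,
	       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}

With a constant three runnable tasks the 1-minute figure climbs toward 3.0, closing about (1 - EXP_1/FIXED_1) of the remaining gap every five seconds.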
@@ -4240,10 +4304,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
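The semi-idle test underlying find_new_ilb() boils down to two cpumask operations: a sched_group is semi-idle when the set of tick-stopped (nohz) CPUs intersects the group without covering all of it. A toy userspace model using plain bitmasks (the masks and CPU numbering below are invented purely for illustration):

#include <stdio.h>

/* Model a sched_group's CPUs and the nohz (tick-stopped, idle) CPUs
 * as plain bitmasks; bit i set means "CPU i is in the set". */
static int is_semi_idle_group(unsigned int group_cpus, unsigned int nohz_cpus)
{
	unsigned int idle_in_group = group_cpus & nohz_cpus;

	if (idle_in_group == 0)			/* no idle CPU in the group */
		return 0;
	if (idle_in_group == group_cpus)	/* every CPU in the group is idle */
		return 0;
	return 1;				/* at least one idle and one busy CPU */
}

int main(void)
{
	/* group = CPUs 0-3; CPUs 1 and 2 have stopped their tick */
	printf("%d\n", is_semi_idle_group(0x0f, 0x06));	/* 1: semi-idle */
	printf("%d\n", is_semi_idle_group(0x0f, 0x0f));	/* 0: fully idle */
	printf("%d\n", is_semi_idle_group(0x0f, 0x00));	/* 0: fully busy */
	return 0;
}

find_new_ilb() then returns the first idle CPU of the first such group it finds, so idle load balancing stays on a package that is already partly awake instead of waking a fully idle one.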
@@ -4298,8 +4478,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4664,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -5007,13 +5195,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5070,15 +5260,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5405,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -6490,8 +6674,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6970,6 +7155,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7397,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8071,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		struct sched_domain *sd;
 
 		sd = &per_cpu(phys_domains, j).sd;
-		if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+		if (j != group_first_cpu(sd->groups)) {
 			/*
 			 * Only add "power" once for each
 			 * physical package.
@@ -7953,7 +8149,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8938,6 +9134,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9243,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9055,6 +9256,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10002,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
 	find_matching_se(&se, &pse);
 
-	while (se) {
-		BUG_ON(!pse);
-
-		if (wakeup_preempt_entity(se, pse) == 1) {
-			resched_task(curr);
-			break;
-		}
-
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
+	BUG_ON(!pse);
+
+	if (wakeup_preempt_entity(se, pse) == 1)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a26ed294f938 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying. The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);
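For reference, the new get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT) call performs the same rescaling the three removed assignments did: it shifts the FSHIFT-based avenrun values up to the fixed point that struct sysinfo::loads uses. A minimal sketch of the conversion, assuming the usual FSHIFT = 11 and SI_LOAD_SHIFT = 16 (values not shown by this diff):

#include <stdio.h>

#define FSHIFT        11	/* kernel-internal fixed point (assumed) */
#define FIXED_1       (1 << FSHIFT)
#define SI_LOAD_SHIFT 16	/* fixed point used by struct sysinfo (assumed) */

int main(void)
{
	/* a 1-minute load of 0.75 as the scheduler stores it */
	unsigned long avenrun0 = (3 * FIXED_1) / 4;

	/* what get_avenrun(loads, 0, SI_LOAD_SHIFT - FSHIFT) would yield */
	unsigned long si_load = (avenrun0 + 0) << (SI_LOAD_SHIFT - FSHIFT);

	/* both encode 0.75, just with different scale factors */
	printf("avenrun: %lu/%d  sysinfo: %lu/%d\n",
	       avenrun0, FIXED_1, si_load, 1 << SI_LOAD_SHIFT);
	return 0;
}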
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_common(q, mode, 1, 0, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);