author     Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
commit     99e97b860e14c64760855198e91d1166697131a7 (patch)
tree       fadc8368c3f784bff92fba82d983e7861559cf9d
parent     82782ca77d1bfb32b0334cce40a25b91bd8ec016 (diff)
parent     f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: fix typo in sched-rt-group.txt file
  ftrace: fix typo about map of kernel priority in ftrace.txt file.
  sched: properly define the sched_group::cpumask and sched_domain::span fields
  sched, timers: cleanup avenrun users
  sched, timers: move calc_load() to scheduler
  sched: Don't export sched_mc_power_savings on multi-socket single core system
  sched: emit thread info flags with stack trace
  sched: rt: document the risk of small values in the bandwidth settings
  sched: Replace first_cpu() with cpumask_first() in ILB nomination code
  sched: remove extra call overhead for schedule()
  sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus())
  wait: don't use __wake_up_common()
  sched: Nominate a power-efficient ilb in select_nohz_balancer()
  sched: Nominate idle load balancer from a semi-idle package.
  sched: remove redundant hierarchy walk in check_preempt_wakeup
 Documentation/scheduler/sched-rt-group.txt |  20
 Documentation/trace/ftrace.txt             |  15
 arch/x86/include/asm/topology.h            |   3
 fs/proc/loadavg.c                          |  18
 include/linux/sched.h                      |  28
 include/linux/wait.h                       |   2
 kernel/mutex.c                             |   4
 kernel/sched.c                             | 279
 kernel/sched_fair.c                        |  13
 kernel/sched_idletask.c                    |   3
 kernel/time/timekeeping.c                  |   2
 kernel/timer.c                             |  86
 kernel/wait.c                              |   2
 13 files changed, 325 insertions(+), 150 deletions(-)

diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3fc625a..1df7f9cdab05 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
 CONTENTS
 ========
 
+0. WARNING
 1. Overview
   1.1 The problem
   1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
 3. Future plans
 
 
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system, the knobs are
+ root only and assumes root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
 1. Overview
 ===========
 
@@ -169,7 +187,7 @@ get their allocated time.
 
 Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
 the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
 do deadline inheritance (since priority is inversely proportional to the
 deadline delta (deadline - now).
 
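
The warning added above concerns the two runtime knobs this document describes,
/proc/sys/kernel/sched_rt_period_us and /proc/sys/kernel/sched_rt_runtime_us.
As a rough illustration (not part of the patch; the sysctl paths are the ones
documented in this file, everything else is hypothetical), a small userspace C
sketch can read both values and report the share of each period left for
real-time tasks:

/* Illustrative only: report the global RT bandwidth from the documented sysctls. */
#include <stdio.h>
#include <stdlib.h>

static long read_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	long val;

	if (!f || fscanf(f, "%ld", &val) != 1) {
		perror(path);
		exit(1);
	}
	fclose(f);
	return val;
}

int main(void)
{
	long period = read_sysctl("/proc/sys/kernel/sched_rt_period_us");
	long runtime = read_sysctl("/proc/sys/kernel/sched_rt_runtime_us");

	if (runtime < 0) {
		/* -1 means "no limit"; 0 would starve the kernel's own RT threads */
		printf("RT throttling disabled (sched_rt_runtime_us = %ld)\n", runtime);
	} else {
		printf("RT tasks may use %ld of every %ld us (%.1f%%)\n",
		       runtime, period, 100.0 * runtime / period);
	}
	return 0;
}
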
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e693813..e362f50c496f 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
 values starting at 100 (nice -20). Below is a quick chart to map
 the kernel priority to user land priorities.
 
-  Kernel priority: 0 to 99 ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140 ==> idle task priority
+   Kernel Space                     User Space
+ ===============================================================
+   0(high) to  98(low)     user RT priority 99(high) to 1(low)
+                           with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                       sched_priority is not used in scheduling
+                           decisions(it must be specified as 0)
+ ---------------------------------------------------------------
+  100(high) to 139(low)    user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+  140                      idle task priority
+ ---------------------------------------------------------------
 
 The task states are:
 
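
The corrected table can be expressed as a small conversion helper. The sketch
below is illustrative only (it is not part of ftrace); it assumes the usual
constants MAX_RT_PRIO == 100 and the nice mapping prio = 120 + nice, which is
exactly what the chart encodes:

/* Illustrative: translate a kernel priority (as shown by ftrace) into
 * the user-space view described by the table above. */
#include <stdio.h>

#define MAX_RT_PRIO	100	/* kernel prio 0..99 are the RT range */
#define MAX_PRIO	140	/* kernel prio 140 is the idle task   */

static void describe_prio(int prio)
{
	if (prio >= 0 && prio < MAX_RT_PRIO - 1)
		printf("prio %3d: user RT priority %d (SCHED_FIFO/SCHED_RR)\n",
		       prio, MAX_RT_PRIO - 1 - prio);
	else if (prio == MAX_RT_PRIO - 1)
		printf("prio %3d: sched_priority not used (specified as 0)\n", prio);
	else if (prio >= MAX_RT_PRIO && prio < MAX_PRIO)
		printf("prio %3d: nice %d\n", prio, prio - 120);
	else if (prio == MAX_PRIO)
		printf("prio %3d: idle task\n", prio);
	else
		printf("prio %3d: out of range\n", prio);
}

int main(void)
{
	int p;

	for (p = 0; p <= MAX_PRIO; p += 20)
		describe_prio(p);
	return 0;
}
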
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca49..066ef590d7e0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
 void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
+			(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
 #define smt_capable()	(smp_num_siblings > 1)
 #endif
 
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99ee..1afa4dd4cae2 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-	int a, b, c;
-	unsigned long seq;
+	unsigned long avnrun[3];
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
+	get_avenrun(avnrun, FIXED_1/200, 0);
 
-	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;
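
For reference, the 11-bit fixed-point representation that loadavg_proc_show()
prints works as sketched below. The macros mirror the kernel's FSHIFT, FIXED_1,
LOAD_INT and LOAD_FRAC definitions, and the FIXED_1/200 term is the same
rounding offset passed to get_avenrun() above; this is an illustrative
userspace rendering, not kernel code.

/* Illustrative: how an 11-bit fixed-point load average becomes "0.42". */
#include <stdio.h>

#define FSHIFT		11			/* bits of precision   */
#define FIXED_1		(1 << FSHIFT)		/* 1.0 in fixed point  */
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* e.g. a load of 0.42 stored by the kernel as 0.42 * 2048 = 860 */
	unsigned long avnrun = 860 + FIXED_1 / 200;	/* same rounding as /proc/loadavg */

	printf("%lu.%02lu\n", LOAD_INT(avnrun), LOAD_FRAC(avnrun));
	return 0;
}
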
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..dbb1043e8656 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
  * 11 bit fractions.
  */
 extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
@@ -135,8 +136,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -838,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -924,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
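
The cpumask[0]/span[0] members documented above are trailing variable-length
arrays: the structure is either allocated with extra space appended, or
embedded in a larger static wrapper. A minimal, self-contained C sketch of the
dynamic case (illustrative names only, not the scheduler's actual allocation
path):

/* Illustrative: allocating a struct whose last member is a zero-length
 * array (GNU C extension, as used by the kernel). */
#include <stdio.h>
#include <stdlib.h>

struct group {
	unsigned int power;
	unsigned long mask[0];	/* storage hangs off the end of the struct */
};

static struct group *alloc_group(size_t mask_bytes)
{
	/* one allocation covers the fixed part plus the trailing mask */
	return calloc(1, sizeof(struct group) + mask_bytes);
}

int main(void)
{
	struct group *g = alloc_group(sizeof(unsigned long));

	if (!g)
		return 1;
	g->mask[0] |= 1UL << 3;		/* "CPU 3" is a member of the group */
	printf("mask = %#lx\n", g->mask[0]);
	free(g);
	return 0;
}
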
diff --git a/include/linux/wait.h b/include/linux/wait.h
index bc024632f365..6788e1a4d4ca 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..6ca5fe96e393 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		__schedule();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2958,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -4240,10 +4304,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4478,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4664,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	}
 
 	if (atomic_read(&nohz.load_balancer) == -1) {
-		/*
-		 * simple selection for now: Nominate the
-		 * first cpu in the nohz list to be the next
-		 * ilb owner.
-		 *
-		 * TBD: Traverse the sched domains and nominate
-		 * the nearest cpu in the nohz.cpu_mask.
-		 */
-		int ilb = cpumask_first(nohz.cpu_mask);
+		int ilb = find_new_ilb(cpu);
 
 		if (ilb < nr_cpu_ids)
 			resched_cpu(ilb);
@@ -5007,13 +5195,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5070,15 +5260,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5405,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -6490,8 +6674,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6970,6 +7155,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7397,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8071,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j).sd;
-			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -7953,7 +8149,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8938,6 +9134,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9243,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9055,6 +9256,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10002,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
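
The calc_load() helper added above is an exponentially-weighted moving average
in the same 11-bit fixed point: every LOAD_FREQ interval,
avenrun[n] = (avenrun[n]*exp + active*(FIXED_1 - exp)) >> FSHIFT. The sketch
below replays that recurrence in userspace with the kernel's published
EXP_1/EXP_5/EXP_15 constants; it illustrates the arithmetic only, not the
kernel's sampling or the calc_load_tasks accounting.

/* Illustrative: the fixed-point recurrence behind the 1/5/15 minute averages. */
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* 1/exp(5sec/1min)  as fixed-point */
#define EXP_5	2014		/* 1/exp(5sec/5min)  */
#define EXP_15	2037		/* 1/exp(5sec/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 2 * FIXED_1;	/* pretend 2 tasks stay runnable */
	int i;

	/* one iteration ~= one LOAD_FREQ (5 second) interval; 60 ~= 5 minutes */
	for (i = 0; i < 60; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}
	printf("after 5 minutes of load 2: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
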
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
 	find_matching_se(&se, &pse);
 
-	while (se) {
-		BUG_ON(!pse);
+	BUG_ON(!pse);
 
-		if (wakeup_preempt_entity(se, pse) == 1) {
-			resched_task(curr);
-			break;
-		}
-
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
+	if (wakeup_preempt_entity(se, pse) == 1)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a26ed294f938 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying. The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);
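
The avenrun values that do_sysinfo() now fills via get_avenrun() reach
userspace through sysinfo(2), scaled by SI_LOAD_SHIFT (16) rather than FSHIFT.
A small illustrative consumer, using only the standard glibc interface (the
1 << 16 scale factor here is an assumption matching SI_LOAD_SHIFT):

/* Illustrative: reading the load averages exported by do_sysinfo(). */
#include <stdio.h>
#include <sys/sysinfo.h>

int main(void)
{
	struct sysinfo si;
	const double scale = 1 << 16;	/* SI_LOAD_SHIFT fixed-point scale */

	if (sysinfo(&si) != 0) {
		perror("sysinfo");
		return 1;
	}
	printf("uptime: %ld s, load: %.2f %.2f %.2f, procs: %hu\n",
	       si.uptime,
	       si.loads[0] / scale, si.loads[1] / scale, si.loads[2] / scale,
	       si.procs);
	return 0;
}
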
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_common(q, mode, 1, 0, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);