author     Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-06-10 18:32:59 -0400
commit     99e97b860e14c64760855198e91d1166697131a7 (patch)
tree       fadc8368c3f784bff92fba82d983e7861559cf9d /kernel
parent     82782ca77d1bfb32b0334cce40a25b91bd8ec016 (diff)
parent     f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
sched: fix typo in sched-rt-group.txt file
ftrace: fix typo about map of kernel priority in ftrace.txt file.
sched: properly define the sched_group::cpumask and sched_domain::span fields
sched, timers: cleanup avenrun users
sched, timers: move calc_load() to scheduler
sched: Don't export sched_mc_power_savings on multi-socket single core system
sched: emit thread info flags with stack trace
sched: rt: document the risk of small values in the bandwidth settings
sched: Replace first_cpu() with cpumask_first() in ILB nomination code
sched: remove extra call overhead for schedule()
sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus())
wait: don't use __wake_up_common()
sched: Nominate a power-efficient ilb in select_nohz_balancer()
sched: Nominate idle load balancer from a semi-idle package.
sched: remove redundant hierarchy walk in check_preempt_wakeup
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/mutex.c            |   4
-rw-r--r--  kernel/sched.c            | 279
-rw-r--r--  kernel/sched_fair.c       |  13
-rw-r--r--  kernel/sched_idletask.c   |   3
-rw-r--r--  kernel/time/timekeeping.c |   2
-rw-r--r--  kernel/timer.c            |  86
-rw-r--r--  kernel/wait.c             |   2
7 files changed, 262 insertions, 127 deletions
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..6ca5fe96e393 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
                 /* didnt get the lock, go to sleep: */
                 spin_unlock_mutex(&lock->wait_lock, flags);
-                __schedule();
+                preempt_enable_no_resched();
+                schedule();
+                preempt_disable();
                 spin_lock_mutex(&lock->wait_lock, flags);
         }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
         struct list_head migration_queue;
 #endif
 
+        /* calc_load related fields */
+        unsigned long calc_load_update;
+        long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,72 @@ unsigned long nr_iowait(void)
         return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:  pointer to dest load array
+ * @offset: offset to add
+ * @shift:  shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 {
-        unsigned long i, running = 0, uninterruptible = 0;
+        loads[0] = (avenrun[0] + offset) << shift;
+        loads[1] = (avenrun[1] + offset) << shift;
+        loads[2] = (avenrun[2] + offset) << shift;
+}
 
-        for_each_online_cpu(i) {
-                running += cpu_rq(i)->nr_running;
-                uninterruptible += cpu_rq(i)->nr_uninterruptible;
-        }
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        return load >> FSHIFT;
+}
 
-        if (unlikely((long)uninterruptible < 0))
-                uninterruptible = 0;
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+        unsigned long upd = calc_load_update + 10;
+        long active;
 
-        return running + uninterruptible;
+        if (time_before(jiffies, upd))
+                return;
+
+        active = atomic_long_read(&calc_load_tasks);
+        active = active > 0 ? active * FIXED_1 : 0;
+
+        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+        calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+        long nr_active, delta;
+
+        nr_active = this_rq->nr_running;
+        nr_active += (long) this_rq->nr_uninterruptible;
+
+        if (nr_active != this_rq->calc_load_active) {
+                delta = nr_active - this_rq->calc_load_active;
+                this_rq->calc_load_active = nr_active;
+                atomic_long_add(delta, &calc_load_tasks);
+        }
 }
 
 /*
@@ -2899,6 +2958,11 @@ static void update_cpu_load(struct rq *this_rq)
                 new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+
+        if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+                this_rq->calc_load_update += LOAD_FREQ;
+                calc_load_account_active(this_rq);
+        }
 }
 
 #ifdef CONFIG_SMP
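
The calc_load()/calc_global_load() pair above is a fixed-point exponential moving average: FIXED_1 is 1.0 in FSHIFT = 11 bits of fixed point, and EXP_1, EXP_5 and EXP_15 are the per-interval decay factors for the 1, 5 and 15 minute averages, applied once per LOAD_FREQ interval (about every five seconds). The standalone userspace sketch below replays that arithmetic with the constants from include/linux/sched.h so the convergence is easy to see; it is only an illustration of the formula, not kernel code, and the simulated task count is made up.

/*
 * Userspace model of the fixed-point load-average update shown above.
 * Constants mirror include/linux/sched.h; illustration only, not kernel code.
 */
#include <stdio.h>

#define FSHIFT   11                 /* nr of bits of precision */
#define FIXED_1  (1 << FSHIFT)      /* 1.0 as fixed-point */
#define EXP_1    1884               /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5    2014               /* 1/exp(5sec/5min) */
#define EXP_15   2037               /* 1/exp(5sec/15min) */

/* Same formula as calc_load() in the hunk above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 2 * FIXED_1;     /* pretend 2 tasks stay runnable */
        int i;

        /* One iteration per LOAD_FREQ interval (roughly every 5 seconds). */
        for (i = 1; i <= 60; i++) {
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
                if (i % 12 == 0)        /* print once per simulated minute */
                        printf("%2d min: %.2f %.2f %.2f\n", i / 12,
                               avenrun[0] / (double)FIXED_1,
                               avenrun[1] / (double)FIXED_1,
                               avenrun[2] / (double)FIXED_1);
        }
        return 0;
}
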
@@ -4240,10 +4304,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+        cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:  The cpu whose lowest level of sched domain is to
+ *        be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ *        for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd)
+                if (sd && (sd->flags & flag))
+                        break;
+
+        return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:  The cpu whose domains we're iterating over.
+ * @sd:   variable holding the value of the power_savings_sd
+ *        for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for (sd = lowest_flag_domain(cpu, flag); \
+                (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                    sched_group_cpus(ilb_group));
+
+        /*
+         * A sched_group is semi-idle when it has atleast one busy cpu
+         * and atleast one idle cpu.
+         */
+        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+                return 0;
+
+        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+                return 0;
+
+        return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ *          Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *ilb_group;
+
+        /*
+         * Have idle load balancer selection from semi-idle packages only
+         * when power-aware load balancing is enabled
+         */
+        if (!(sched_smt_power_savings || sched_mc_power_savings))
+                goto out_done;
+
+        /*
+         * Optimize for the case when we have no idle CPUs or only one
+         * idle CPU. Don't walk the sched_domain hierarchy in such cases
+         */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                goto out_done;
+
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                ilb_group = sd->groups;
+
+                do {
+                        if (is_semi_idle_group(ilb_group))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                        ilb_group = ilb_group->next;
+
+                } while (ilb_group != sd->groups);
+        }
+
+out_done:
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+        return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
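
is_semi_idle_group() above boils down to two cpumask tests: the group must intersect nohz.cpu_mask (at least one tick-stopped CPU) but must not be wholly contained in it (at least one busy CPU). The userspace sketch below models the same predicate with plain 64-bit masks standing in for cpumask_t; the CPU numbering and the example masks are invented purely for illustration.

/*
 * Model of the semi-idle test above using plain 64-bit masks in place of
 * cpumask_t. A group is "semi-idle" when it has at least one idle (nohz)
 * CPU and at least one busy CPU.
 */
#include <stdio.h>
#include <stdint.h>

static int is_semi_idle_group(uint64_t group_cpus, uint64_t nohz_cpus)
{
        uint64_t idle_in_group = group_cpus & nohz_cpus;

        if (idle_in_group == 0)          /* cpumask_empty(): every CPU busy */
                return 0;
        if (idle_in_group == group_cpus) /* cpumask_equal(): every CPU idle */
                return 0;
        return 1;                        /* mix of idle and busy CPUs */
}

int main(void)
{
        uint64_t nohz = 0x5;    /* CPUs 0 and 2 have stopped their tick */

        /* Hypothetical two-CPU packages: {0,1} and {2,3}. */
        printf("group {0,1}: %d\n", is_semi_idle_group(0x3, nohz)); /* 1: CPU0 idle, CPU1 busy */
        printf("group {2,3}: %d\n", is_semi_idle_group(0xc, nohz)); /* 1: CPU2 idle, CPU3 busy */
        printf("all idle  : %d\n", is_semi_idle_group(0x3, 0x3));   /* 0 */
        printf("all busy  : %d\n", is_semi_idle_group(0x3, 0x0));   /* 0 */
        return 0;
}
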
@@ -4298,8 +4478,24 @@ int select_nohz_load_balancer(int stop_tick)
                         /* make me the ilb owner */
                         if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                 return 1;
-                } else if (atomic_read(&nohz.load_balancer) == cpu)
+                } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                        int new_ilb;
+
+                        if (!(sched_smt_power_savings ||
+                                                sched_mc_power_savings))
+                                return 1;
+                        /*
+                         * Check to see if there is a more power-efficient
+                         * ilb.
+                         */
+                        new_ilb = find_new_ilb(cpu);
+                        if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                                atomic_set(&nohz.load_balancer, -1);
+                                resched_cpu(new_ilb);
+                                return 0;
+                        }
                         return 1;
+                }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@ -4468,15 +4664,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
 
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                        /*
-                         * simple selection for now: Nominate the
-                         * first cpu in the nohz list to be the next
-                         * ilb owner.
-                         *
-                         * TBD: Traverse the sched domains and nominate
-                         * the nearest cpu in the nohz.cpu_mask.
-                         */
-                        int ilb = cpumask_first(nohz.cpu_mask);
+                        int ilb = find_new_ilb(cpu);
 
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -5007,13 +5195,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
 
+need_resched:
+        preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -5070,15 +5260,9 @@ need_resched_nonpreemptible:
 
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-        preempt_disable();
-        __schedule();
         preempt_enable_no_resched();
-        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+        if (need_resched())
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
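
The two hunks above fold __schedule() back into schedule(): preemption is disabled at the top of schedule(), re-enabled without an immediate reschedule at the bottom, and the body is retried while a resched request is still pending, which is what lets callers such as the mutex slowpath earlier in this merge bracket a plain schedule() call themselves. The sketch below is only a control-flow model of that shape, with trivial stand-ins for the preemption counter and the context switch; it is obviously not the real scheduler.

/*
 * Control-flow model of the consolidated schedule() above: preempt disabled
 * on entry, re-enabled without resched on exit, body repeated while a
 * resched request is pending. All helpers are trivial stand-ins.
 */
#include <stdio.h>

static int preempt_count;
static int resched_pending = 2;         /* pretend two back-to-back requests */

static void preempt_disable(void)           { preempt_count++; }
static void preempt_enable_no_resched(void) { preempt_count--; }
static int  need_resched(void)              { return resched_pending > 0; }

static void context_switch(void)
{
        /* stand-in for pick_next_task() plus the actual switch */
        resched_pending--;
        printf("switched (preempt_count=%d)\n", preempt_count);
}

static void schedule(void)
{
need_resched:
        preempt_disable();
        context_switch();
        preempt_enable_no_resched();
        if (need_resched())
                goto need_resched;      /* was the TIF_NEED_RESCHED re-test */
}

int main(void)
{
        schedule();
        printf("done, preempt_count=%d\n", preempt_count);
        return 0;
}
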
@@ -5221,7 +5405,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
 {
         wait_queue_t *curr, *next;
@@ -6490,8 +6674,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
 #endif
-        printk(KERN_CONT "%5lu %5d %6d\n", free,
-                task_pid_nr(p), task_pid_nr(p->real_parent));
+        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+                task_pid_nr(p), task_pid_nr(p->real_parent),
+                (unsigned long)task_thread_info(p)->flags);
 
         show_stack(p, NULL);
 }
@@ -6970,6 +7155,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
         }
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7397,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
+                rq->calc_load_update = calc_load_update;
+                rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
-
+                calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
         struct sched_group sg;
@@ -7875,7 +8071,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                 struct sched_domain *sd;
 
                 sd = &per_cpu(phys_domains, j).sd;
-                if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                if (j != group_first_cpu(sd->groups)) {
                         /*
                          * Only add "power" once for each
                          * physical package.
@@ -7953,7 +8149,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
         WARN_ON(!sd || !sd->groups);
 
-        if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+        if (cpu != group_first_cpu(sd->groups))
                 return;
 
         child = sd->child;
@@ -8938,6 +9134,8 @@ void __init sched_init(void)
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+                rq->calc_load_active = 0;
+                rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9243,9 @@ void __init sched_init(void)
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+
+        calc_load_update = jiffies + LOAD_FREQ;
+
         /*
          * During early bootup we pretend to be a normal task:
          */
@@ -9055,6 +9256,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
         alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+        alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
         alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10002,13 @@ static int sched_rt_global_constraints(void)
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
 
+        /*
+         * There's always some RT tasks in the root group
+         * -- migration, kstopmachine etc..
+         */
+        if (sysctl_sched_rt_runtime == 0)
+                return -EBUSY;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119..5f9650e8fe75 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
         find_matching_se(&se, &pse);
 
-        while (se) {
-                BUG_ON(!pse);
+        BUG_ON(!pse);
 
-                if (wakeup_preempt_entity(se, pse) == 1) {
-                        resched_task(curr);
-                        break;
-                }
-
-                se = parent_entity(se);
-                pse = parent_entity(pse);
-        }
+        if (wakeup_preempt_entity(se, pse) == 1)
+                resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13..499672c10cbd 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
         schedstat_inc(rq, sched_goidle);
-
+        /* adjust the active tasks as we might go into a long sleep */
+        calc_load_account_active(rq);
         return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7..52a8bf8931f3 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..a26ed294f938 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1123,47 +1123,6 @@ void update_process_times(int user_tick)
 }
 
 /*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-        return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-        unsigned long active_tasks; /* fixed-point */
-        static int count = LOAD_FREQ;
-
-        count -= ticks;
-        if (unlikely(count < 0)) {
-                active_tasks = count_active_tasks();
-                do {
-                        CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-                        CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-                        CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-                        count += LOAD_FREQ;
-                } while (count < 0);
-        }
-}
-
-/*
  * This function runs timers and the timer-tq in bottom half context.
  */
 static void run_timer_softirq(struct softirq_action *h)
@@ -1187,16 +1146,6 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-        update_wall_time();
-        calc_load(ticks);
-}
-
-/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
         jiffies_64 += ticks;
-        update_times(ticks);
+        update_wall_time();
+        calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
         unsigned long mem_total, sav_total;
         unsigned int mem_unit, bitcount;
-        unsigned long seq;
+        struct timespec tp;
 
         memset(info, 0, sizeof(struct sysinfo));
 
-        do {
-                struct timespec tp;
-                seq = read_seqbegin(&xtime_lock);
-
-                /*
-                 * This is annoying. The below is the same thing
-                 * posix_get_clock_monotonic() does, but it wants to
-                 * take the lock which we want to cover the loads stuff
-                 * too.
-                 */
-
-                getnstimeofday(&tp);
-                tp.tv_sec += wall_to_monotonic.tv_sec;
-                tp.tv_nsec += wall_to_monotonic.tv_nsec;
-                monotonic_to_bootbased(&tp);
-                if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-                        tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-                        tp.tv_sec++;
-                }
-                info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+        ktime_get_ts(&tp);
+        monotonic_to_bootbased(&tp);
+        info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-                info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-                info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-                info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+        get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
         info->procs = nr_threads;
-        } while (read_seqretry(&xtime_lock, seq));
 
         si_meminfo(info);
         si_swapinfo(info);
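
In the do_sysinfo() hunk above, get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT) rescales the kernel's FSHIFT = 11 bit fixed-point averages to the 16-bit fixed point of the sysinfo ABI (SI_LOAD_SHIFT is 16). The ordinary userspace program below consumes that ABI through sysinfo(2) and converts the values back to the familiar decimal form; it is shown only to illustrate the units and is not part of the patch.

/*
 * The kernel keeps avenrun[] in 1/2048ths (FSHIFT = 11); sysinfo() exposes
 * the same numbers in 1/65536ths (SI_LOAD_SHIFT = 16), which is what the
 * get_avenrun(..., SI_LOAD_SHIFT - FSHIFT) call above produces.
 */
#include <stdio.h>
#include <sys/sysinfo.h>

#ifndef SI_LOAD_SHIFT
#define SI_LOAD_SHIFT 16        /* from <linux/kernel.h> */
#endif

int main(void)
{
        struct sysinfo si;
        const double scale = 1 << SI_LOAD_SHIFT;        /* 65536.0 */

        if (sysinfo(&si) != 0) {
                perror("sysinfo");
                return 1;
        }

        /* Same numbers /proc/loadavg prints, recovered from the fixed point. */
        printf("load average: %.2f %.2f %.2f\n",
               si.loads[0] / scale, si.loads[1] / scale, si.loads[2] / scale);
        return 0;
}
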
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c8..ea7c3b4275cf 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
         if (!list_empty(&wait->task_list))
                 list_del_init(&wait->task_list);
         else if (waitqueue_active(q))
-                __wake_up_common(q, mode, 1, 0, key);
+                __wake_up_locked_key(q, mode, key);
         spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);