author     Ingo Molnar <mingo@elte.hu>    2009-06-11 11:55:42 -0400
committer  Ingo Molnar <mingo@elte.hu>    2009-06-11 11:55:42 -0400
commit     940010c5a314a7bd9b498593bc6ba1718ac5aec5 (patch)
tree       d141e08ced08c40c6a8e3ab2cdecde5ff14e560f /kernel/sched.c
parent     8dc8e5e8bc0ce00b0f656bf972f67cd8a72759e5 (diff)
parent     991ec02cdca33b03a132a0cacfe6f0aa0be9aa8d (diff)
Merge branch 'linus' into perfcounters/core
Conflicts:
arch/x86/kernel/irqinit.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/traps.c
arch/x86/mm/fault.c
include/linux/sched.h
kernel/exit.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  359
1 files changed, 315 insertions, 44 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 8d43347a0c0d..5b3f6ec1b0b3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -73,13 +73,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -119,12 +121,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -632,6 +628,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1730,6 +1730,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1960,7 +1962,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
@@ -2021,6 +2023,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch - wait for a thread to complete at least one
+ * context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw	= p->nvcsw;
+	nivcsw	= p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
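A note on the helper added above: wait_task_context_switch() snapshots both the voluntary (nvcsw) and involuntary (nivcsw) switch counters, then spins until the task is either seen off the CPU or one counter has advanced by more than one, which guarantees at least one full context switch completed in between. A minimal, hypothetical caller might look like the following sketch (example_detach_tracer is an assumed name, not part of the patch; the declaration is assumed to come from include/linux/sched.h):

/* Hypothetical usage sketch -- not part of this patch. */
static void example_detach_tracer(struct task_struct *tsk)
{
	/* The helper requires that @tsk is not the calling task. */
	if (WARN_ON(tsk == current))
		return;

	/* Make sure @tsk has scheduled at least once before teardown. */
	wait_task_context_switch(tsk);
}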
@@ -2485,6 +2530,17 @@ out:
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
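For context, the semantics documented above correspond to the usual sleep/wake pairing. A rough sketch under assumed names (the example_* helpers are illustrative, not taken from this patch):

#include <linux/sched.h>
#include <linux/kthread.h>

/* Sleeper side: mark ourselves sleeping, then schedule away. */
static int example_sleeper(void *unused)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		__set_current_state(TASK_RUNNING);
		/* ... handle whatever we were woken for ... */
	}
	return 0;
}

/* Waker side. */
static int example_wake(struct task_struct *sleeper)
{
	/* 1 if we actually woke it, 0 if it was already runnable. */
	return wake_up_process(sleeper);
}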
@@ -2795,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
@@ -2885,19 +2941,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	if (time_before(jiffies, upd))
+		return;
 
-	return running + uninterruptible;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2937,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
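The avenrun update introduced above is the classic fixed-point exponential average: every LOAD_FREQ ticks each slot becomes load = (load * EXP + active * (FIXED_1 - EXP)) >> FSHIFT, where active is the folded per-rq count of running plus uninterruptible tasks. A self-contained user-space sketch follows; the FSHIFT/FIXED_1/EXP_* values are the usual include/linux/sched.h constants and are assumed here rather than taken from this diff:

#include <stdio.h>

/* Assumed constants, mirroring include/linux/sched.h of this era. */
#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed point */
#define EXP_5	2014			/* 1/exp(5sec/5min) */
#define EXP_15	2037			/* 1/exp(5sec/15min) */

/* Same recurrence as calc_load() in the hunk above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	/* Pretend 3 tasks were runnable/uninterruptible every interval. */
	unsigned long active = 3 * FIXED_1;
	int i;

	for (i = 0; i < 12; i++) {	/* roughly one minute of 5 s ticks */
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* Print in /proc/loadavg style: integer part and two decimals. */
	printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
	       avenrun[0] >> FSHIFT, ((avenrun[0] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[1] >> FSHIFT, ((avenrun[1] & (FIXED_1 - 1)) * 100) >> FSHIFT,
	       avenrun[2] >> FSHIFT, ((avenrun[2] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}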
@@ -4278,10 +4392,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
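To make the semi-idle test above concrete: a sched_group qualifies when the intersection of the nohz set (idle, tick-stopped CPUs) with the group's CPUs is neither empty nor the whole group. A toy user-space sketch with plain 64-bit masks (purely illustrative; the kernel code operates on cpumask_t via the nohz.ilb_grp_nohz_mask scratch mask):

#include <stdio.h>
#include <stdint.h>

/*
 * Same logic as is_semi_idle_group() above, on flat bitmasks:
 * at least one idle CPU and at least one busy CPU in the group.
 */
static int is_semi_idle(uint64_t nohz_cpus, uint64_t group_cpus)
{
	uint64_t idle_in_group = nohz_cpus & group_cpus;

	if (idle_in_group == 0)
		return 0;		/* no idle CPU in the group */
	if (idle_in_group == group_cpus)
		return 0;		/* every CPU in the group is idle */
	return 1;
}

int main(void)
{
	uint64_t group = 0x0f;				/* CPUs 0-3 form one group */

	printf("%d\n", is_semi_idle(0x00, group));	/* 0: all busy  */
	printf("%d\n", is_semi_idle(0x0f, group));	/* 0: all idle  */
	printf("%d\n", is_semi_idle(0x0c, group));	/* 1: semi-idle */
	return 0;
}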
@@ -4336,8 +4566,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4506,15 +4752,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	}
 
 	if (atomic_read(&nohz.load_balancer) == -1) {
-		/*
-		 * simple selection for now: Nominate the
-		 * first cpu in the nohz list to be the next
-		 * ilb owner.
-		 *
-		 * TBD: Traverse the sched domains and nominate
-		 * the nearest cpu in the nohz.cpu_mask.
-		 */
-		int ilb = cpumask_first(nohz.cpu_mask);
+		int ilb = find_new_ilb(cpu);
 
 		if (ilb < nr_cpu_ids)
 			resched_cpu(ilb);
@@ -5047,13 +5285,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5111,15 +5351,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
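The two hunks above fold the previous __schedule()/schedule() split back into a single function: the preemption-disable loop that used to live in the thin wrapper now wraps the scheduler body directly. Schematically, the merged function reduces to the following outline (a simplified sketch of what the patch produces, with the runqueue work elided):

asmlinkage void __sched schedule(void)
{
need_resched:
	preempt_disable();
	/* ... pick prev and next, context_switch(), reacquire the BKL ... */
	preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}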
@@ -5262,7 +5496,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -5282,6 +5516,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5320,6 +5557,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5356,6 +5596,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5373,6 +5616,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
 * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6531,8 +6777,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -7011,6 +7258,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7245,6 +7500,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7284,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7796,8 +8053,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7918,7 +8176,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j).sd;
-			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -7996,7 +8254,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8981,6 +9239,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9088,6 +9348,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9098,6 +9361,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9845,6 +10109,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;