Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  447
1 file changed, 380 insertions(+), 67 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 9fe3774a0fd3..8fb88a906aaa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -68,17 +69,18 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -584,6 +580,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -630,6 +627,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
@@ -1967,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+		perf_counter_task_migration(p, new_cpu);
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2015,6 +2022,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch -	wait for a thread to complete at least one
+ *				context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw	= p->nvcsw;
+	nivcsw	= p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2142,6 +2192,7 @@ void kick_process(struct task_struct *p)
 		smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2375,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2458,6 +2530,17 @@ out:
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,6 +2563,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 	p->se.start_runtime		= 0;
@@ -2710,6 +2794,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (post_schedule)
@@ -2766,7 +2851,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
@@ -2856,19 +2941,81 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
+}
+
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
 }
 
 /*
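
The calc_load()/calc_global_load() pair added above implements the fixed-point, exponentially decayed average behind /proc/loadavg: each runqueue folds its active-task delta into calc_load_tasks, and ten ticks after the sampling window closes calc_global_load() blends that total into avenrun[]. As a rough illustration of the arithmetic only, here is a standalone userspace sketch (not kernel code) using the loadavg constants from include/linux/sched.h (FSHIFT = 11, FIXED_1 = 1 << FSHIFT, EXP_1 = 1884) and a constant load of three runnable tasks over ten 5-second updates:

    /*
     * Userspace sketch of the calc_load() arithmetic above; prints how the
     * 1-minute load average climbs toward a steady load of 3 runnable tasks.
     */
    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fractional precision */
    #define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point */
    #define EXP_1   1884                    /* 1-minute decay for 5-second ticks */

    static unsigned long calc_load(unsigned long load, unsigned long exp,
                                   unsigned long active)
    {
            load *= exp;
            load += active * (FIXED_1 - exp);
            return load >> FSHIFT;
    }

    int main(void)
    {
            unsigned long avenrun0 = 0;
            unsigned long active = 3 * FIXED_1;     /* three runnable tasks */
            int i;

            for (i = 0; i < 10; i++) {
                    avenrun0 = calc_load(avenrun0, EXP_1, active);
                    printf("loadavg(1m) ~ %lu.%02lu\n", avenrun0 >> FSHIFT,
                           ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
            }
            return 0;
    }

Each step moves the 1-minute average about (FIXED_1 - EXP_1)/FIXED_1, roughly 8%, of the remaining distance toward the instantaneous value; EXP_5 and EXP_15 decay correspondingly more slowly.
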
@@ -2899,6 +3046,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
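
The call added to update_cpu_load() above runs calc_load_account_active() once per LOAD_FREQ interval, and that helper only pushes the change in the runqueue's nr_running + nr_uninterruptible into the shared calc_load_tasks counter, so the global side never has to walk every CPU the way the removed nr_active() did. A minimal userspace sketch of that delta-folding idea (plain variables standing in for the per-rq fields and the kernel's atomic_long_t; illustration only):

    #include <stdio.h>

    #define NR_CPUS 4

    static long calc_load_tasks;            /* global; atomic_long_t in the kernel */
    static long calc_load_active[NR_CPUS];  /* last value each "runqueue" reported */

    static void calc_load_account_active(int cpu, long nr_active)
    {
            if (nr_active != calc_load_active[cpu]) {
                    long delta = nr_active - calc_load_active[cpu];

                    calc_load_active[cpu] = nr_active;
                    calc_load_tasks += delta;   /* atomic_long_add() in the kernel */
            }
    }

    int main(void)
    {
            calc_load_account_active(0, 2);     /* cpu 0 reports 2 active tasks */
            calc_load_account_active(1, 5);     /* cpu 1 reports 5 */
            calc_load_account_active(1, 3);     /* cpu 1 drops to 3 */

            printf("calc_load_tasks = %ld\n", calc_load_tasks);  /* prints 5 */
            return 0;
    }
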
@@ -4240,6 +4392,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
@@ -4249,6 +4402,121 @@ int get_nohz_load_balancer(void)
 	return atomic_read(&nohz.load_balancer);
 }
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
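
is_semi_idle_group() above boils down to two cpumask tests: the nohz (idle, tick-stopped) CPUs restricted to the group must be neither empty nor the whole group, i.e. the group needs at least one idle and at least one busy CPU before find_new_ilb() will pick an idle load balancer from it. A standalone userspace sketch of the same test, with plain bitmasks standing in for struct cpumask (illustration only):

    #include <stdio.h>

    /* Mirrors is_semi_idle_group(): idle CPUs in the group must form a
     * non-empty, proper subset of the group. */
    static int is_semi_idle_group(unsigned long nohz_cpus, unsigned long group_cpus)
    {
            unsigned long idle_in_group = nohz_cpus & group_cpus;   /* cpumask_and() */

            if (idle_in_group == 0)                 /* cpumask_empty(): all busy */
                    return 0;
            if (idle_in_group == group_cpus)        /* cpumask_equal(): all idle */
                    return 0;
            return 1;
    }

    int main(void)
    {
            unsigned long group = 0x0f;             /* a group of CPUs 0-3 */

            printf("%d\n", is_semi_idle_group(0x00, group));    /* 0: no idle CPU */
            printf("%d\n", is_semi_idle_group(0x0f, group));    /* 0: fully idle */
            printf("%d\n", is_semi_idle_group(0x06, group));    /* 1: semi-idle */
            return 0;
    }
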
@@ -4303,8 +4571,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4473,15 +4757,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	}
 
 	if (atomic_read(&nohz.load_balancer) == -1) {
-		/*
-		 * simple selection for now: Nominate the
-		 * first cpu in the nohz list to be the next
-		 * ilb owner.
-		 *
-		 * TBD: Traverse the sched domains and nominate
-		 * the nearest cpu in the nohz.cpu_mask.
-		 */
-		int ilb = cpumask_first(nohz.cpu_mask);
+		int ilb = find_new_ilb(cpu);
 
 		if (ilb < nr_cpu_ids)
 			resched_cpu(ilb);
@@ -4737,7 +5013,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 
 	if (user_tick)
 		account_user_time(p, one_jiffy, one_jiffy_scaled);
-	else if (p != rq->idle)
+	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 		account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
 				    one_jiffy_scaled);
 	else
@@ -4845,6 +5121,8 @@ void scheduler_tick(void)
 	curr->sched_class->task_tick(rq, curr, 0);
 	spin_unlock(&rq->lock);
 
+	perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
@@ -5012,13 +5290,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5058,6 +5338,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -5075,15 +5356,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5226,7 +5501,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -5246,6 +5521,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5284,6 +5562,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5320,6 +5601,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5337,6 +5621,9 @@ EXPORT_SYMBOL(complete);
  * @x: holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6495,8 +6782,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6975,6 +7263,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7209,6 +7505,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7248,7 +7546,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7284,8 +7582,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else. This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
@@ -7530,24 +7830,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -7758,8 +8055,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7880,7 +8178,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j).sd;
-			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -7958,7 +8256,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8872,7 +9170,7 @@ void __init sched_init(void)
 	 * we use alloc_bootmem().
 	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -8945,6 +9243,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8965,7 +9265,7 @@ void __init sched_init(void)
 	 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 	 * then A0's share of the cpu resource is:
 	 *
-	 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+	 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 	 *
 	 * We achieve this by letting init_task_group's tasks sit
 	 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9052,20 +9352,26 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 
@@ -9807,6 +10113,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;