path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  279
1 file changed, 244 insertions(+), 35 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..228acae8821f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,72 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
-{
-	unsigned long i, running = 0, uninterruptible = 0;
-
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
-
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
-
-	return running + uninterruptible;
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:  pointer to dest load array
+ * @offset: offset to add
+ * @shift:  shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
+
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
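
The calc_load()/calc_global_load() pair added above is the fixed-point exponential moving average behind /proc/loadavg. Below is a minimal user-space sketch of that arithmetic; it assumes the usual constants from include/linux/sched.h (FSHIFT = 11, FIXED_1 = 1 << 11, EXP_1 = 1884, i.e. 1/exp(5s/1min)) and is an illustration only, not kernel code:

#include <stdio.h>

#define FSHIFT  11                      /* bits of fixed-point precision */
#define FIXED_1 (1UL << FSHIFT)         /* 1.0 in fixed point */
#define EXP_1   1884                    /* 1/exp(5s/1min) in fixed point */

/* Same arithmetic as the kernel's calc_load() above. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        unsigned long active = 3 * FIXED_1;     /* pretend 3 runnable tasks */
        int i;

        /* 24 samples is about 2 minutes at LOAD_FREQ = 5s; the 1-minute
         * average converges towards 3.00. */
        for (i = 0; i < 24; i++) {
                avenrun0 = calc_load(avenrun0, EXP_1, active);
                printf("sample %2d: load = %lu.%02lu\n", i,
                       avenrun0 >> FSHIFT,
                       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        }
        return 0;
}

Each sample moves the average about (FIXED_1 - EXP_1)/FIXED_1, roughly 8%, of the way towards the instantaneous task count, which is why the reported load lags sudden changes.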
@@ -2899,6 +2958,11 @@ static void update_cpu_load(struct rq *this_rq)
 		new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
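
The hook added to update_cpu_load() above runs calc_load_account_active() at most once per LOAD_FREQ interval per CPU, and that function only folds the difference since the CPU's last contribution into the shared counter. A toy user-space model of this delta accounting, with plain longs standing in for the rq fields and the atomic counter (no concurrency, purely illustrative):

#include <stdio.h>

#define NR_CPUS 4

static long calc_load_tasks;            /* atomic_long_t in the kernel */
static long calc_load_active[NR_CPUS];  /* per-rq field in the kernel */

/* Publish only the change since this CPU's previous contribution. */
static void account_active(int cpu, long nr_active)
{
        long delta = nr_active - calc_load_active[cpu];

        if (delta) {
                calc_load_active[cpu] = nr_active;
                calc_load_tasks += delta;  /* atomic_long_add() in the kernel */
        }
}

int main(void)
{
        account_active(0, 3);   /* cpu0: 3 runnable/uninterruptible tasks */
        account_active(1, 1);
        account_active(0, 2);   /* one task left cpu0: delta = -1 */
        printf("calc_load_tasks = %ld\n", calc_load_tasks);     /* prints 3 */
        return 0;
}

Keeping a per-CPU remembered value means the global sum stays consistent without ever re-scanning all runqueues, which is what the removed nr_active() loop used to do.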
@@ -4240,10 +4304,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:  The cpu whose lowest level of sched domain is to
+ *        be returned.
+ * @flag: The flag to check for the lowest sched_domain
+ *        for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:  The cpu whose domains we're iterating over.
+ * @sd:   variable holding the value of the power_savings_sd
+ *        for cpu.
+ * @flag: The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns: 1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu: The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns: Returns the id of the idle load balancer if it exists,
+ *          Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
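
is_semi_idle_group() above boils down to one mask test: a sched_group is semi-idle when the set of tick-stopped (nohz) CPUs intersects the group but does not cover it, i.e. the group has at least one idle and at least one busy CPU. A small user-space sketch with plain bitmasks standing in for cpumasks (illustrative only, not the kernel cpumask API):

#include <stdio.h>

/* group_mask: CPUs in the sched_group; nohz_mask: CPUs with the tick stopped. */
static int is_semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
        unsigned long idle_in_group = group_mask & nohz_mask;

        if (idle_in_group == 0)                 /* no idle CPU in the group */
                return 0;
        if (idle_in_group == group_mask)        /* the whole group is idle */
                return 0;
        return 1;                               /* mix of idle and busy CPUs */
}

int main(void)
{
        /* CPUs 0-3 form one package; CPUs 1 and 3 have stopped their tick. */
        unsigned long group = 0xf, nohz = 0xa;

        printf("semi-idle: %d\n", is_semi_idle(group, nohz));   /* prints 1 */
        return 0;
}

find_new_ilb() then walks the SD_POWERSAVINGS_BALANCE domains from lowest to highest and nominates the first idle CPU it finds in such a semi-idle group, so idle load balancing stays on a package that is already partly awake.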
@@ -4298,8 +4478,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
@@ -4468,15 +4664,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -5007,13 +5195,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5070,15 +5260,9 @@ need_resched_nonpreemptible:
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5405,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
@@ -6490,8 +6674,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
@@ -6970,6 +7155,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7397,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
@@ -7875,7 +8071,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		struct sched_domain *sd;
 
 		sd = &per_cpu(phys_domains, j).sd;
-		if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+		if (j != group_first_cpu(sd->groups)) {
 			/*
 			 * Only add "power" once for each
 			 * physical package.
@@ -7953,7 +8149,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
@@ -8938,6 +9134,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9243,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
@@ -9055,6 +9256,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10002,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;