4 files changed, 62 insertions, 22 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d0036e52a24a..2c79e921a68b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -862,6 +862,7 @@ struct sched_group {
         * single CPU.
         */
        unsigned int cpu_power, cpu_power_orig;
+        unsigned int group_weight;
        /*
         * The CPUs this group covers.
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..dc91a4d09ac3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -560,18 +560,8 @@ struct rq {
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static inline
-void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
-{
-        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
-        /*
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
-         * A queue event has occurred, and we're going to schedule.  In
-         * this case, we can save a useless back to back clock update.
-         */
-        if (test_tsk_need_resched(p))
-                rq->skip_clock_update = 1;
-}
 static inline int cpu_of(struct rq *rq)
 {
@@ -2118,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                p->sched_class->prio_changed(rq, p, oldprio, running);
 }
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+        const struct sched_class *class;
+        if (p->sched_class == rq->curr->sched_class) {
+                rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+        } else {
+                for_each_class(class) {
+                        if (class == rq->curr->sched_class)
+                                break;
+                        if (class == p->sched_class) {
+                                resched_task(rq->curr);
+                                break;
+                        }
+                }
+        }
+        /*
+         * A queue event has occurred, and we're going to schedule.  In
+         * this case, we can save a useless back to back clock update.
+         */
+        if (test_tsk_need_resched(rq->curr))
+                rq->skip_clock_update = 1;
+}
 #ifdef CONFIG_SMP
 /*
 * Is this task likely cache-hot:
@@ -6960,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
        if (cpu != group_first_cpu(sd->groups))
                return;
+        sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
        child = sd->child;
        sd->groups->cpu_power = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..52ab113d8bb9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        int scale = cfs_rq->nr_running >= sched_nr_latency;
-        if (unlikely(rt_prio(p->prio)))
-                goto preempt;
-        if (unlikely(p->sched_class != &fair_sched_class))
-                return;
        if (unlikely(se == pse))
                return;
@@ -2035,13 +2029,16 @@ struct sd_lb_stats {
        unsigned long this_load_per_task;
        unsigned long this_nr_running;
        unsigned long this_has_capacity;
+        unsigned int  this_idle_cpus;
        /* Statistics of the busiest group */
+        unsigned int  busiest_idle_cpus;
        unsigned long max_load;
        unsigned long busiest_load_per_task;
        unsigned long busiest_nr_running;
        unsigned long busiest_group_capacity;
        unsigned long busiest_has_capacity;
+        unsigned int  busiest_group_weight;
        int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2060,8 @@ struct sg_lb_stats {
        unsigned long sum_nr_running; /* Nr tasks running in the group */
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long group_capacity;
+        unsigned long idle_cpus;
+        unsigned long group_weight;
        int group_imb; /* Is there an imbalance in the group ? */
        int group_has_capacity; /* Is there extra capacity in the group? */
 };
@@ -2431,7 +2430,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
                sgs->group_load += load;
                sgs->sum_nr_running += rq->nr_running;
                sgs->sum_weighted_load += weighted_cpuload(i);
+                if (idle_cpu(i))
+                        sgs->idle_cpus++;
        }
        /*
@@ -2469,6 +2469,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
        sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
        if (!sgs->group_capacity)
                sgs->group_capacity = fix_small_capacity(sd, group);
+        sgs->group_weight = group->group_weight;
        if (sgs->group_capacity > sgs->sum_nr_running)
                sgs->group_has_capacity = 1;
@@ -2576,13 +2577,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
                        sds->this_has_capacity = sgs.group_has_capacity;
+                        sds->this_idle_cpus = sgs.idle_cpus;
                } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
                        sds->max_load = sgs.avg_load;
                        sds->busiest = sg;
                        sds->busiest_nr_running = sgs.sum_nr_running;
+                        sds->busiest_idle_cpus = sgs.idle_cpus;
                        sds->busiest_group_capacity = sgs.group_capacity;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
                        sds->busiest_has_capacity = sgs.group_has_capacity;
+                        sds->busiest_group_weight = sgs.group_weight;
                        sds->group_imb = sgs.group_imb;
                }
@@ -2860,8 +2864,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
        if (sds.this_load >= sds.avg_load)
                goto out_balanced;
-        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+        /*
-                goto out_balanced;
+         * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+         * And to check for busy balance use !idle_cpu instead of
+         * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+         * even when they are idle.
+         */
+        if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+                if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+                        goto out_balanced;
+        } else {
+                /*
+                 * This cpu is idle. If the busiest group load doesn't
+                 * have more tasks than the number of available cpu's and
+                 * there is no imbalance between this and busiest group
+                 * wrt to idle cpu's, it is balanced.
+                 */
+                if ((sds.this_idle_cpus  <= sds.busiest_idle_cpus + 1) &&
+                    sds.busiest_nr_running <= sds.busiest_group_weight)
+                        goto out_balanced;
+        }
 force_balance:
        /* Looks like there is an imbalance. Compute it */
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 45bddc0c1048..2bf6b47058c1 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
 static void
 check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 {
-        resched_task(rq->curr); /* we preempt everything */
+        /* we're never preempted */
 }
 static struct task_struct *pick_next_task_stop(struct rq *rq)
 {
        struct task_struct *stop = rq->stop;
-        if (stop && stop->state == TASK_RUNNING)
+        if (stop && stop->se.on_rq)
                return stop;
        return NULL;

diff --git a/include/linux/sched.h b/include/linux/sched.h index d0036e52a24a..2c79e921a68b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h
@@ -862,6 +862,7 @@ struct sched_group {
862	* single CPU.	862	* single CPU.
863	*/	863	*/
864	unsigned int cpu_power, cpu_power_orig;	864	unsigned int cpu_power, cpu_power_orig;
		865	unsigned int group_weight;
865		866
866	/*	867	/*
867	* The CPUs this group covers.	868	* The CPUs this group covers.


diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d03..dc91a4d09ac3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -560,18 +560,8 @@ struct rq {
560		560
561	static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);	561	static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562		562
563	static inline
564	void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565	{
566	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567		563
568	/*	564	static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569	* A queue event has occurred, and we're going to schedule. In
570	* this case, we can save a useless back to back clock update.
571	*/
572	if (test_tsk_need_resched(p))
573	rq->skip_clock_update = 1;
574	}
575		565
576	static inline int cpu_of(struct rq *rq)	566	static inline int cpu_of(struct rq *rq)
577	{	567	{
@@ -2118,6 +2108,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118	p->sched_class->prio_changed(rq, p, oldprio, running);	2108	p->sched_class->prio_changed(rq, p, oldprio, running);
2119	}	2109	}
2120		2110
		2111	static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
		2112	{
		2113	const struct sched_class *class;
		2114
		2115	if (p->sched_class == rq->curr->sched_class) {
		2116	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
		2117	} else {
		2118	for_each_class(class) {
		2119	if (class == rq->curr->sched_class)
		2120	break;
		2121	if (class == p->sched_class) {
		2122	resched_task(rq->curr);
		2123	break;
		2124	}
		2125	}
		2126	}
		2127
		2128	/*
		2129	* A queue event has occurred, and we're going to schedule. In
		2130	* this case, we can save a useless back to back clock update.
		2131	*/
		2132	if (test_tsk_need_resched(rq->curr))
		2133	rq->skip_clock_update = 1;
		2134	}
		2135
2121	#ifdef CONFIG_SMP	2136	#ifdef CONFIG_SMP
2122	/*	2137	/*
2123	* Is this task likely cache-hot:	2138	* Is this task likely cache-hot:
@@ -6960,6 +6975,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960	if (cpu != group_first_cpu(sd->groups))	6975	if (cpu != group_first_cpu(sd->groups))
6961	return;	6976	return;
6962		6977
		6978	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
		6979
6963	child = sd->child;	6980	child = sd->child;
6964		6981
6965	sd->groups->cpu_power = 0;	6982	sd->groups->cpu_power = 0;


diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..52ab113d8bb9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -1654,12 +1654,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1654	struct cfs_rq *cfs_rq = task_cfs_rq(curr);	1654	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1655	int scale = cfs_rq->nr_running >= sched_nr_latency;	1655	int scale = cfs_rq->nr_running >= sched_nr_latency;
1656		1656
1657	if (unlikely(rt_prio(p->prio)))
1658	goto preempt;
1659
1660	if (unlikely(p->sched_class != &fair_sched_class))
1661	return;
1662
1663	if (unlikely(se == pse))	1657	if (unlikely(se == pse))
1664	return;	1658	return;
1665		1659
@@ -2035,13 +2029,16 @@ struct sd_lb_stats {
2035	unsigned long this_load_per_task;	2029	unsigned long this_load_per_task;
2036	unsigned long this_nr_running;	2030	unsigned long this_nr_running;
2037	unsigned long this_has_capacity;	2031	unsigned long this_has_capacity;
		2032	unsigned int this_idle_cpus;
2038		2033
2039	/* Statistics of the busiest group */	2034	/* Statistics of the busiest group */
		2035	unsigned int busiest_idle_cpus;
2040	unsigned long max_load;	2036	unsigned long max_load;
2041	unsigned long busiest_load_per_task;	2037	unsigned long busiest_load_per_task;
2042	unsigned long busiest_nr_running;	2038	unsigned long busiest_nr_running;
2043	unsigned long busiest_group_capacity;	2039	unsigned long busiest_group_capacity;
2044	unsigned long busiest_has_capacity;	2040	unsigned long busiest_has_capacity;
		2041	unsigned int busiest_group_weight;
2045		2042
2046	int group_imb; /* Is there imbalance in this sd */	2043	int group_imb; /* Is there imbalance in this sd */
2047	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)	2044	#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2063,6 +2060,8 @@ struct sg_lb_stats {
2063	unsigned long sum_nr_running; /* Nr tasks running in the group */	2060	unsigned long sum_nr_running; /* Nr tasks running in the group */
2064	unsigned long sum_weighted_load; /* Weighted load of group's tasks */	2061	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2065	unsigned long group_capacity;	2062	unsigned long group_capacity;
		2063	unsigned long idle_cpus;
		2064	unsigned long group_weight;
2066	int group_imb; /* Is there an imbalance in the group ? */	2065	int group_imb; /* Is there an imbalance in the group ? */
2067	int group_has_capacity; /* Is there extra capacity in the group? */	2066	int group_has_capacity; /* Is there extra capacity in the group? */
2068	};	2067	};
@@ -2431,7 +2430,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2431	sgs->group_load += load;	2430	sgs->group_load += load;
2432	sgs->sum_nr_running += rq->nr_running;	2431	sgs->sum_nr_running += rq->nr_running;
2433	sgs->sum_weighted_load += weighted_cpuload(i);	2432	sgs->sum_weighted_load += weighted_cpuload(i);
2434		2433	if (idle_cpu(i))
		2434	sgs->idle_cpus++;
2435	}	2435	}
2436		2436
2437	/*	2437	/*
@@ -2469,6 +2469,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2469	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);	2469	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2470	if (!sgs->group_capacity)	2470	if (!sgs->group_capacity)
2471	sgs->group_capacity = fix_small_capacity(sd, group);	2471	sgs->group_capacity = fix_small_capacity(sd, group);
		2472	sgs->group_weight = group->group_weight;
2472		2473
2473	if (sgs->group_capacity > sgs->sum_nr_running)	2474	if (sgs->group_capacity > sgs->sum_nr_running)
2474	sgs->group_has_capacity = 1;	2475	sgs->group_has_capacity = 1;
@@ -2576,13 +2577,16 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2576	sds->this_nr_running = sgs.sum_nr_running;	2577	sds->this_nr_running = sgs.sum_nr_running;
2577	sds->this_load_per_task = sgs.sum_weighted_load;	2578	sds->this_load_per_task = sgs.sum_weighted_load;
2578	sds->this_has_capacity = sgs.group_has_capacity;	2579	sds->this_has_capacity = sgs.group_has_capacity;
		2580	sds->this_idle_cpus = sgs.idle_cpus;
2579	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {	2581	} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2580	sds->max_load = sgs.avg_load;	2582	sds->max_load = sgs.avg_load;
2581	sds->busiest = sg;	2583	sds->busiest = sg;
2582	sds->busiest_nr_running = sgs.sum_nr_running;	2584	sds->busiest_nr_running = sgs.sum_nr_running;
		2585	sds->busiest_idle_cpus = sgs.idle_cpus;
2583	sds->busiest_group_capacity = sgs.group_capacity;	2586	sds->busiest_group_capacity = sgs.group_capacity;
2584	sds->busiest_load_per_task = sgs.sum_weighted_load;	2587	sds->busiest_load_per_task = sgs.sum_weighted_load;
2585	sds->busiest_has_capacity = sgs.group_has_capacity;	2588	sds->busiest_has_capacity = sgs.group_has_capacity;
		2589	sds->busiest_group_weight = sgs.group_weight;
2586	sds->group_imb = sgs.group_imb;	2590	sds->group_imb = sgs.group_imb;
2587	}	2591	}
2588		2592
@@ -2860,8 +2864,26 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2860	if (sds.this_load >= sds.avg_load)	2864	if (sds.this_load >= sds.avg_load)
2861	goto out_balanced;	2865	goto out_balanced;
2862		2866
2863	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)	2867	/*
2864	goto out_balanced;	2868	* In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
		2869	* And to check for busy balance use !idle_cpu instead of
		2870	* CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
		2871	* even when they are idle.
		2872	*/
		2873	if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
		2874	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		2875	goto out_balanced;
		2876	} else {
		2877	/*
		2878	* This cpu is idle. If the busiest group load doesn't
		2879	* have more tasks than the number of available cpu's and
		2880	* there is no imbalance between this and busiest group
		2881	* wrt to idle cpu's, it is balanced.
		2882	*/
		2883	if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		2884	sds.busiest_nr_running <= sds.busiest_group_weight)
		2885	goto out_balanced;
		2886	}
2865		2887
2866	force_balance:	2888	force_balance:
2867	/* Looks like there is an imbalance. Compute it */	2889	/* Looks like there is an imbalance. Compute it */


diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 45bddc0c1048..2bf6b47058c1 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c
@@ -19,14 +19,14 @@ select_task_rq_stop(struct rq *rq, struct task_struct *p,
19	static void	19	static void
20	check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)	20	check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21	{	21	{
22	resched_task(rq->curr); /* we preempt everything */	22	/* we're never preempted */
23	}	23	}
24		24
25	static struct task_struct *pick_next_task_stop(struct rq *rq)	25	static struct task_struct *pick_next_task_stop(struct rq *rq)
26	{	26	{
27	struct task_struct *stop = rq->stop;	27	struct task_struct *stop = rq->stop;
28		28
29	if (stop && stop->state == TASK_RUNNING)	29	if (stop && stop->se.on_rq)
30	return stop;	30	return stop;
31		31
32	return NULL;	32	return NULL;