author	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-21 15:55:43 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-10-21 15:55:43 -0400
commit	bc4016f48161454a9a8e5eb209b0693c6cde9f62 (patch)
tree	f470f5d711e975b152eec90282f5dd30a1d5dba5	/kernel/sched_fair.c
parent	5d70f79b5ef6ea2de4f72a37b2d96e2601e40a22 (diff)
parent	b7dadc38797584f6203386da1947ed5edf516646 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
  sched: Export account_system_vtime()
  sched: Call tick_check_idle before __irq_enter
  sched: Remove irq time from available CPU power
  sched: Do not account irq time to current task
  x86: Add IRQ_TIME_ACCOUNTING
  sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
  sched: Add a PF flag for ksoftirqd identification
  sched: Consolidate account_system_vtime extern declaration
  sched: Fix softirq time accounting
  sched: Drop group_capacity to 1 only if local group has extra capacity
  sched: Force balancing on newidle balance if local group has capacity
  sched: Set group_imb only a task can be pulled from the busiest cpu
  sched: Do not consider SCHED_IDLE tasks to be cache hot
  sched: Drop all load weight manipulation for RT tasks
  sched: Create special class for stop/migrate work
  sched: Unindent labels
  sched: Comment updates: fix default latency and granularity numbers
  tracing/sched: Add sched_pi_setprio tracepoint
  sched: Give CPU bound RT tasks preference
  sched: Try not to migrate higher priority RT tasks
  ...
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	76
1 file changed, 59 insertions(+), 17 deletions(-)
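Editor's note: the diff below touches three areas of kernel/sched_fair.c. The fair-class runtime bookkeeping switches from rq->clock to rq->clock_task so that interrupt time (tracked separately by the IRQ_TIME_ACCOUNTING work in this series) is not charged to the running task; scale_rt_power() gains an underflow guard; and the load balancer learns about spare group capacity so a newly idle CPU can pull work from a saturated group. A minimal standalone sketch of that last decision follows; struct lb_snapshot and worth_balancing() are hypothetical stand-ins written for illustration, not kernel symbols.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the fields find_busiest_group() consults. */
struct lb_snapshot {
	unsigned long this_load;	/* avg load of the local group */
	unsigned long max_load;		/* avg load of the busiest group */
	bool this_has_capacity;		/* local: nr_running < group_capacity */
	bool busiest_has_capacity;	/* busiest: nr_running < group_capacity */
};

bool worth_balancing(const struct lb_snapshot *s, bool newly_idle)
{
	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
	if (newly_idle && s->this_has_capacity && !s->busiest_has_capacity)
		return true;

	/* otherwise the usual rule applies: only pull if we are lighter */
	return s->this_load < s->max_load;
}

int main(void)
{
	struct lb_snapshot s = {
		.this_load = 600, .max_load = 500,	/* local group not lighter... */
		.this_has_capacity = true,		/* ...but it has an idle CPU */
		.busiest_has_capacity = false,
	};

	/* a newly idle CPU pulls even though this_load >= max_load */
	printf("%d\n", worth_balancing(&s, true));	/* 1 */
	printf("%d\n", worth_balancing(&s, false));	/* 0 */
	return 0;
}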
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5f996d36ac5d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
 
 /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 static void update_curr(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *curr = cfs_rq->curr;
-	u64 now = rq_of(cfs_rq)->clock;
+	u64 now = rq_of(cfs_rq)->clock_task;
 	unsigned long delta_exec;
 
 	if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	/*
 	 * We are starting a new run period:
 	 */
-	se->exec_start = rq_of(cfs_rq)->clock;
+	se->exec_start = rq_of(cfs_rq)->clock_task;
 }
 
 /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	set_task_cpu(p, this_cpu);
 	activate_task(this_rq, p, 0);
 	check_preempt_curr(this_rq, p, 0);
+
+	/* re-arm NEWIDLE balancing when moving tasks */
+	src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+	this_rq->idle_stamp = 0;
 }
 
 /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	tsk_cache_hot = task_hot(p, rq->clock, sd);
+	tsk_cache_hot = task_hot(p, rq->clock_task, sd);
 	if (!tsk_cache_hot ||
 			sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
 	unsigned long this_load;
 	unsigned long this_load_per_task;
 	unsigned long this_nr_running;
+	unsigned long this_has_capacity;
 
 	/* Statistics of the busiest group */
 	unsigned long max_load;
 	unsigned long busiest_load_per_task;
 	unsigned long busiest_nr_running;
 	unsigned long busiest_group_capacity;
+	unsigned long busiest_has_capacity;
 
 	int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
 	unsigned long group_capacity;
 	int group_imb; /* Is there an imbalance in the group ? */
+	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
 /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
 	u64 total, available;
 
 	total = sched_avg_period() + (rq->clock - rq->age_stamp);
-	available = total - rq->rt_avg;
+
+	if (unlikely(total < rq->rt_avg)) {
+		/* Ensures that power won't end up being negative */
+		available = 0;
+	} else {
+		available = total - rq->rt_avg;
+	}
 
 	if (unlikely((s64)total < SCHED_LOAD_SCALE))
 		total = SCHED_LOAD_SCALE;
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
-	unsigned long load, max_cpu_load, min_cpu_load;
+	unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
 	int i;
 	unsigned int balance_cpu = -1, first_idle_cpu = 0;
 	unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	/* Tally up the load of all CPUs in the group */
 	max_cpu_load = 0;
 	min_cpu_load = ~0UL;
+	max_nr_running = 0;
 
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
+			if (load > max_cpu_load) {
 				max_cpu_load = load;
+				max_nr_running = rq->nr_running;
+			}
 			if (min_cpu_load > load)
 				min_cpu_load = load;
 		}
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	if (sgs->sum_nr_running)
 		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
 		sgs->group_imb = 1;
 
-	sgs->group_capacity =
-		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+	sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(sd, group);
+
+	if (sgs->group_capacity > sgs->sum_nr_running)
+		sgs->group_has_capacity = 1;
 }
 
 /**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		/*
 		 * In case the child domain prefers tasks go to siblings
 		 * first, lower the sg capacity to one so that we'll try
-		 * and move all the excess tasks away.
+		 * and move all the excess tasks away. We lower the capacity
+		 * of a group only if the local group has the capacity to fit
+		 * these excess tasks, i.e. nr_running < group_capacity. The
+		 * extra check prevents the case where you always pull from the
+		 * heaviest group when it is already under-utilized (possible
+		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling)
+		if (prefer_sibling && !local_group && sds->this_has_capacity)
 			sgs.group_capacity = min(sgs.group_capacity, 1UL);
 
 		if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->this = sg;
 			sds->this_nr_running = sgs.sum_nr_running;
 			sds->this_load_per_task = sgs.sum_weighted_load;
+			sds->this_has_capacity = sgs.group_has_capacity;
 		} else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
 			sds->max_load = sgs.avg_load;
 			sds->busiest = sg;
 			sds->busiest_nr_running = sgs.sum_nr_running;
 			sds->busiest_group_capacity = sgs.group_capacity;
 			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->busiest_has_capacity = sgs.group_has_capacity;
 			sds->group_imb = sgs.group_imb;
 		}
 
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 		return fix_small_imbalance(sds, this_cpu, imbalance);
 
 }
+
 /******* find_busiest_group() helpers end here *********************/
 
 /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * 4) This group is more busy than the avg busieness at this
 	 *    sched_domain.
 	 * 5) The imbalance is within the specified limit.
+	 *
+	 * Note: when doing newidle balance, if the local group has excess
+	 * capacity (i.e. nr_running < group_capacity) and the busiest group
+	 * does not have any capacity, we force a load balance to pull tasks
+	 * to the local group. In this case, we skip past checks 3, 4 and 5.
 	 */
 	if (!(*balance))
 		goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (!sds.busiest || sds.busiest_nr_running == 0)
 		goto out_balanced;
 
+	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+	if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+			!sds.busiest_has_capacity)
+		goto force_balance;
+
 	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
+force_balance:
 	/* Looks like there is an imbalance. Compute it */
 	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
 
 	if (!ld_moved) {
 		schedstat_inc(sd, lb_failed[idle]);
-		sd->nr_balance_failed++;
+		/*
+		 * Increment the failure counter only on periodic balance.
+		 * We do not want newidle balance, which can be very
+		 * frequent, pollute the failure counter causing
+		 * excessive cache_hot migrations and active balances.
+		 */
+		if (idle != CPU_NEWLY_IDLE)
+			sd->nr_balance_failed++;
 
 		if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
 					this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
 			next_balance = sd->last_balance + interval;
-		if (pulled_task) {
-			this_rq->idle_stamp = 0;
+		if (pulled_task)
 			break;
-		}
 	}
 
 	raw_spin_lock(&this_rq->lock);
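A note on the scale_rt_power() hunk above: available is a u64, so when rt_avg exceeds the measured period (which becomes plausible once the irq-accounting patches in this series also fold irq time into the rt average), a plain total - rq->rt_avg would wrap around to an enormous value rather than going negative, and the CPU would appear to have huge spare power. The new branch clamps it to zero instead. A standalone illustration of the wraparound, ordinary userspace C and not kernel code:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* toy values: more rt/irq time accounted than the measured period */
	uint64_t total = 1000, rt_avg = 1500;

	uint64_t wrapped = total - rt_avg;			/* wraps to ~2^64 */
	uint64_t clamped = (total < rt_avg) ? 0 : total - rt_avg;

	printf("wrapped available = %llu\n", (unsigned long long)wrapped);
	printf("clamped available = %llu\n", (unsigned long long)clamped);
	return 0;
}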